normal mode unavailable here
--- Directories ---
--- Files ---
dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npyloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[2,3]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=dot-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].png