header

normal mode unavailable here

root/nn/restoreJ-no-help-4/out

--- Directories ---
--- Files ---
dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_cos.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_dot.npydim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False_loss_mse.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_cos.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_dot.npydim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True_loss_mse.npyloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-cos-dim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-dot-dim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=cos-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mix-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.001-loss=mse-opti=adam-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.01-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=cos-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=cos-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mix-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(2**6),log(2**-8)].pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-bwd_lr=0.1-loss=mse-opti=sgd-σs=[id,gelu,exp(nocap),log(nocap)].pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[20,30]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.00390625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.015625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.0625-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=0.25-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=cos-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=cos-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=dot-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=dot-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=mix-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=mix-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=mse-opti=adam-bwd_lr=0.001-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.01-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(2**6),log(2**-8)]-θbatch=True.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=False.pngloss-mse-dim=[40,60]-dim_sample=1.0-loss=mse-opti=sgd-bwd_lr=0.1-σs=[id,gelu,exp(nocap),log(nocap)]-θbatch=True.png