# Hyperparameters for the HF Trainer run whose loss log appears below.
# NOTE(review): no output_dir is passed — older transformers versions require
# it as the first argument; confirm the installed version supplies a default.
training_args = TrainingArguments(
    # Schedule: 20 passes over the data with a short LR warmup.
    num_train_epochs=20,
    warmup_steps=250,
    learning_rate=2e-4,
    weight_decay=0.01,
    # Throughput: per-GPU batch size plus mixed-precision training.
    per_device_train_batch_size=BATCH_SIZE,
    fp16=True,
    # Adafactor keeps optimizer state small compared to AdamW.
    optim="adafactor",
)

Step Training Loss 500 3.420900 1000 2.733200 1500 2.685000 2000 2.604700 2500 2.587000 3000 2.569500 3500 2.522200 4000 2.513200 4500 2.502900 5000 2.476400 5500 2.443500 6000 2.445200 6500 2.431400 7000 2.390000 7500 2.387500 8000 2.395100 8500 2.336500 9000 2.346100 9500 2.355200 10000 2.309700 10500 2.299300 11000 2.311000 11500 2.281100 12000 2.260600 12500 2.272100 13000 2.254900 13500 2.234800 14000 2.229200 14500 2.241900 15000 2.192200 15500 2.203900 16000 2.215100 16500 2.177700 17000 2.177100 17500 2.179900 18000 2.149800 18500 2.153600 19000 2.154600 19500 2.140200 20000 2.123900 20500 2.140300 21000 2.118600 21500 2.103400 22000 2.113900 22500 2.113400 23000 2.084700 23500 2.089700 24000 2.099500 24500 2.083100 25000 2.076500 25500 2.077000 26000 2.063500 26500 2.071400 27000 2.059800 27500 2.059200 28000 2.048300 28500 2.054500 29000 2.048300 29500 2.041100 30000 2.044400 30500 2.043500 31000 2.035000 31500 2.036400 32000 2.042500

TrainOutput(global_step=32100, training_loss=2.256444251039689, metrics={'train_runtime': 28434.5222, 'train_samples_per_second': 72.244, 'train_steps_per_second': 1.129, 'total_flos': 0.0, 'train_loss': 2.256444251039689, 'epoch': 20.0})