# Fine-tune google/mt5-large for extractive QA on the Sinhala SinQuAD dataset
# using HF's seq2seq QA example script. NOTE: the original cell lacked the
# trailing backslashes, so every "--flag" line was parsed as its own command
# and none of the arguments reached the script — the continuations below are
# the fix.
!python /notebooks/seq_2_seq/run_seq2seq_qa.py \
  --model_name_or_path google/mt5-large \
  --dataset_name 9wimu9/SinQuAD \
  --context_column context \
  --question_column question \
  --answer_column answers \
  --do_train \
  --do_eval \
  --per_device_train_batch_size 8 \
  --learning_rate 1e-3 \
  --num_train_epochs 1 \
  --max_seq_length 384 \
  --doc_stride 128 \
  --output_dir mt5-large-v1 \
  --logging_steps 1 \
  --bf16 \
  --gradient_accumulation_steps 4 \
  --gradient_checkpointing True \
  --optim adafactor
# Notes:
# - lr 1e-3 + adafactor matches the T5/mT5 fine-tuning recipe.
# - effective batch size = 8 * grad_accum 4 = 32 per device.
# - bf16 + gradient checkpointing trade compute for memory on mt5-large.
{ "eval/loss":0.9061169624328612, "_timestamp":1686240530.1377208, "_step":370, "_runtime":902.276704788208, "train/global_step":369, "eval/steps_per_second":7.803, "train/train_steps_per_second":0.425, "_wandb.runtime":902, "train/epoch":1, "train/total_flos":26479261148774400, "train/loss":0.1842, "train/train_loss":0.6567919482060565, "train/learning_rate":0, "train/train_runtime":868.8715, "eval/samples_per_second":62.341, "train/train_samples_per_second":13.588, "eval/runtime":25.12 }