rm_final_28531 + rm_final_llama2-cot-additional_1495 + truthful_QA_817
step = 9600