Training procedure

Framework versions

Loading Model + Adapters

import torch

from peft import PeftModel, PeftModelForCausalLM, LoraConfig
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

model_id = "meta-llama/Llama-2-7b-hf" # gated
model = AutoModelForCausalLM.from_pretrained(model_id, 
                                             use_auth_token=True,
                                             torch_dtype=torch.bfloat16,
                                             use_cache=True,
                                             device_map="auto")
# pretraining_tp = 1 uses the standard (faster) linear-layer forward instead of the
# sliced computation that exactly reproduces the tensor-parallel pretraining setup.
model.config.pretraining_tp = 1

model = PeftModel.from_pretrained(model, 'elhindih/lora-checkpoint-2224')

tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)
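Optionally, the LoRA weights can also be folded into the base model with PEFT's merge_and_unload, which removes the adapter indirection at generation time (this is not required to use the checkpoint):

merged_model = model.merge_and_unload()  # merged copy; the adapter can no longer be detached afterwards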

Faster Inference

from datasets import Dataset

BASE = "You are a helpful AI assistant. "
prompt_answer = BASE + "Given the context and the question at the end, your task is to provide most relevant answer while explaining your thoughts step by step."
prompt_followup0 = BASE + "Given the context and a question your task is to provide relevant answer." 
prompt_followup1 = BASE + "Given the context of a conversation and a followup question, your task is to generate most relevant question about the context. Only generate question and not the answer." 
prompt_summary = "You are a helpful AI assistant who is expert at summarizing an input paragraph. You provide most relevant summary to the paragraph while keeping in view the guidelines at the end of the context."

sys_prompt=  {
    "followup1" : prompt_followup0,
    "followup0" : prompt_followup1,
    "followup2" : prompt_followup1,
    "answer1" : prompt_answer,
    "answer0" : prompt_answer,
    "answer2" : prompt_answer,
    "followup3" : prompt_followup1,
    "augment" : prompt_summary,
    "answer3" : prompt_answer
}
# dataset = Dataset.from_pandas(df_new)

# Build an Alpaca-style Instruction / Input / Response prompt, picking the
# system prompt that matches the sample's task name.
def format_instruction(sample):
    return f"""### Instruction:
  {sys_prompt[sample['name']]}

  ### Input:
  {sample['prompt']}

  ### Response:
  """

# Setup generation config
generation_config = model.generation_config
generation_config.max_new_tokens = 300
generation_config.do_sample = True  # temperature / top_p below only take effect when sampling is enabled
generation_config.temperature = 0.7
generation_config.top_p = 0.95
generation_config.num_return_sequences = 1

from time import perf_counter

device = "cuda:0"

def predict_tokens(item):
    """Tokenize a formatted prompt, generate a completion, and report the elapsed time."""
    st = perf_counter()
    tokenized = tokenizer(item, return_tensors="pt")
    # with torch.cuda.amp.autocast():
    with torch.inference_mode():
        outputs = model.generate(
            input_ids=tokenized["input_ids"].to(device),
            attention_mask=tokenized["attention_mask"].to(device),
            generation_config=generation_config,
            eos_token_id=tokenizer.eos_token_id,
            use_cache=True,
        )
    completion = tokenizer.decode(outputs[0], skip_special_tokens=False)
    # completion_begin = completion.find(">>ANSWER<<")
    print(f"Time taken to generate: {perf_counter() - st:.2f}s")
    return {"pred_completion": completion}