Training procedure

Framework versions

How to use (code below)

```bash
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/peft.git git+https://github.com/huggingface/transformers.git
```

```python
import torch

torch.cuda.is_available()
```

```python
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
```

```python
model = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloom-3b",
    torch_dtype=torch.float16,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained("bigscience/tokenizer")

print(model)
```

```python
for param in model.parameters():
    param.requires_grad = False  # freeze the model - train adapters later
    if param.ndim == 1:
        # cast the small parameters (e.g. layernorm) to fp32 for stability
        param.data = param.data.to(torch.float32)
```

```python
model.gradient_checkpointing_enable()  # reduce the number of stored activations
model.enable_input_require_grads()
```

```python
class CastOutputToFloat(nn.Sequential):
    def forward(self, x):
        return super().forward(x).to(torch.float32)


model.lm_head = CastOutputToFloat(model.lm_head)
```

Helper Function

```python
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )
```

```python
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
```

r, the dimension of the low-rank matrices

lora_alpha, the scaling factor for the low-rank update (the update is scaled by lora_alpha / r; see the sketch below)

lora_dropout, dropout probability of the LoRA layers

bias, which bias parameters to train; it is set to none here, while all would train all bias parameters
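
To make the roles of r and lora_alpha concrete, here is a minimal, self-contained sketch of the LoRA idea. The tensor names and sizes are invented for illustration; this is not peft's actual implementation.

```python
import torch

# Illustration only: the frozen weight W is left untouched, and a low-rank
# update B @ A, scaled by lora_alpha / r, is added to its output.
d_out, d_in, r, lora_alpha = 64, 64, 8, 16

W = torch.randn(d_out, d_in)      # frozen pretrained weight
A = torch.randn(r, d_in) * 0.01   # trainable low-rank factor
B = torch.zeros(d_out, r)         # trainable low-rank factor, initialised to zero

x = torch.randn(d_in)
y = W @ x + (lora_alpha / r) * (B @ (A @ x))  # adapted forward pass
```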

```python
model = get_peft_model(model, config)
print_trainable_parameters(model)
```

```python
from datasets import load_dataset

qa_dataset = load_dataset("squad_v2")
```

```python
def create_prompt(context, question, answer):
    if len(answer["text"]) < 1:
        answer = "Cannot Find Answer"
    else:
        answer = answer["text"][0]
    prompt_template = f"### CONTEXT\n{context}\n\n### QUESTION\n{question}\n\n### ANSWER\n{answer}</s>"
    return prompt_template
```
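
As a quick sanity check, the prompt produced for a made-up sample (the context, question, and answer below are invented purely to show the format) looks like this:

```python
# Made-up sample, only to illustrate the prompt format.
sample_answer = {"text": ["Paris"], "answer_start": [0]}
print(create_prompt("Paris is the capital of France.", "What is the capital of France?", sample_answer))
# ### CONTEXT
# Paris is the capital of France.
#
# ### QUESTION
# What is the capital of France?
#
# ### ANSWER
# Paris</s>
```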

```python
mapped_qa_dataset = qa_dataset.map(
    lambda samples: tokenizer(create_prompt(samples["context"], samples["question"], samples["answers"]))
)
```

```python
torch.cuda.empty_cache()
```

Train Model on SQuAD v2 Dataset

```python
import transformers

trainer = transformers.Trainer(
    # parameter docs: https://huggingface.co/docs/transformers/main_classes/trainer
    model=model,
    train_dataset=mapped_qa_dataset["train"],
    eval_dataset=mapped_qa_dataset["validation"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        # evaluation_strategy="steps",
        # eval_steps=1000,
        warmup_steps=100,
        max_steps=100,
        learning_rate=2e-3,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
```

Training arguments (a standalone TrainingArguments reference example, separate from the Trainer configuration above; it requires `from transformers import TrainingArguments`):

```python
args = TrainingArguments(
    output_dir="/Content/mod",
    evaluation_strategy="epoch",     # can be "epoch" or "steps"
    learning_rate=2e-5,              # according to the original BERT paper
    per_device_train_batch_size=32,  # according to the original BERT paper
    per_device_eval_batch_size=32,
    num_train_epochs=3,              # should be between 2 and 4 according to the paper
    weight_decay=0.01,
    prediction_loss_only=True,
)
```

For example, if you set evaluation_strategy="steps" and eval_steps=2000 in the TrainingArguments, you will get training and validation loss every 2000 steps. If you want to do it at the epoch level, set evaluation_strategy="epoch" and logging_strategy="epoch" in the TrainingArguments class.
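
A minimal sketch of such an epoch-level configuration, assuming the argument names of the transformers version installed above (the output directory name is a placeholder):

```python
from transformers import TrainingArguments

# Loss is evaluated and logged once per epoch.
epoch_args = TrainingArguments(
    output_dir="epoch-eval-example",  # placeholder path
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=4,
    num_train_epochs=3,
)
```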

```python
model.config.use_cache = False  # silence the warnings; please re-enable for inference!
trainer.train()
```

Push to Hub

```python
HUGGING_FACE_USER_NAME = "aloksingh2130"
model_name = "Bloom-3B-Squad-v2"

model.push_to_hub(f"{HUGGING_FACE_USER_NAME}/{model_name}", use_auth_token=True)
```
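
After pushing, the LoRA adapter can be loaded back on top of the base model for inference. The snippet below is a minimal sketch, assuming the adapter repo name created above and the same prompt format used for training; the context, question, and generation settings are illustrative.

```python
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

peft_model_id = "aloksingh2130/Bloom-3B-Squad-v2"  # the repo pushed above
config = PeftConfig.from_pretrained(peft_model_id)

base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path, torch_dtype=torch.float16, device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

model = PeftModel.from_pretrained(base_model, peft_model_id)
model.config.use_cache = True  # re-enable the cache for generation

# Prompt in the same format as the training data; content is made up for illustration.
prompt = "### CONTEXT\nParis is the capital of France.\n\n### QUESTION\nWhat is the capital of France?\n\n### ANSWER\n"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```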