🐃🇹🇭 Buffala-LoRa-TH

Buffala-LoRA is a 7B-parameter LLaMA model fine-tuned to follow instructions. It is trained on the Stanford Alpaca (TH Translated), Wisesight, WikiTH, Pantip, and IAppQ&A datasets and makes use of the Huggingface LLaMA implementation. For more information, please visit the project's website.

Issues and what's next?

The model still lacks a significant amount of world knowledge, so it is necessary to fine-tune it on larger Thai datasets. > Next version: CCNet, OSCAR, thWiki
Currently, there is no translation prompt. We plan to fine-tune the model on the SCB Thai-English dataset soon.
The model works well with the LangChain Search agent (Serpapi), which serves as a hotfix for world knowledge. > Plan for Spaces with search chain demo
The model lacks chat capabilities; we are waiting for a LangChain implementation.
Colab demo.
Github for datasets and training notebook.

How to use

import sys

import torch
from peft import PeftModel
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer


# Hugging Face Hub identifiers: the base LLaMA checkpoint and the Thai LoRA
# adapter that is applied on top of it.
BASE_MODEL = "decapoda-research/llama-7b-hf"
LORA_WEIGHTS = "Thaweewat/thai-buffala-lora-7b-v0-1"

# Device the tokenized prompt is moved to before generation.
device = "cuda"

tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)

# Base model: 8-bit loading, float16 dtype, automatic device placement.
base_model_options = dict(
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
model = LlamaForCausalLM.from_pretrained(BASE_MODEL, **base_model_options)

# Wrap the base model with the LoRA adapter weights.
model = PeftModel.from_pretrained(model, LORA_WEIGHTS, torch_dtype=torch.float16)

def generate_prompt(instruction, input=None):
    """Build the Alpaca-style prompt for an instruction.

    If a callable named ``get_list_and_snippet`` exists at module level, its
    result for ``instruction`` is appended to ``input`` (or used as the whole
    "### Input:" section when ``input`` is empty). That helper is not defined
    in this snippet -- presumably it is the search helper from the demo
    notebook; confirm -- so when it is absent the prompt is built without it
    instead of raising ``NameError``.

    Args:
        instruction: The task description.
        input: Optional extra context for the task.

    Returns:
        The prompt string, ending with "### Response:".
    """
    # Resolve the optional helper at call time so this function works whether
    # or not the helper has been defined.
    snippet_helper = globals().get("get_list_and_snippet")
    has_helper = callable(snippet_helper)
    context = snippet_helper(instruction) if has_helper else ""

    if input:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Input:
{input + context}
### Response:"""

    if has_helper:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Input:
{context}
### Response:"""

    # No input and no helper: there is nothing to put in an "### Input:"
    # section, so it is omitted.
    return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Response:"""

# Must mirror the `load_in_8bit` value passed to
# `LlamaForCausalLM.from_pretrained` above. It was previously referenced
# without being defined, which raised NameError.
LOAD_8BIT = True

# Only cast to half precision when the model was not loaded in 8-bit.
if not LOAD_8BIT:
    model.half()

model.eval()
# torch.compile is applied on PyTorch 2+ and skipped on Windows.
if torch.__version__ >= "2" and sys.platform != "win32":
    model = torch.compile(model)


def evaluate(
    instruction,
    input=None,
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    num_beams=4,
    max_new_tokens=128,
    **kwargs,
):
    """Run the model on an instruction and return its response text.

    Args:
        instruction: Task description, forwarded to ``generate_prompt``.
        input: Optional extra context, forwarded to ``generate_prompt``.
        temperature: Passed to ``GenerationConfig``.
        top_p: Passed to ``GenerationConfig``.
        top_k: Passed to ``GenerationConfig``.
        num_beams: Passed to ``GenerationConfig``.
        max_new_tokens: Passed to ``model.generate``.
        **kwargs: Any further ``GenerationConfig`` options.

    Returns:
        The stripped text found after the first "### Response:" marker in the
        decoded output (and before a second marker, if one occurs).

    Raises:
        IndexError: If the decoded output has no "### Response:" marker.
    """
    encoded = tokenizer(generate_prompt(instruction, input), return_tensors="pt")
    prompt_ids = encoded["input_ids"].to(device)

    config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        **kwargs,
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        result = model.generate(
            input_ids=prompt_ids,
            generation_config=config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )

    # The decoded sequence contains the prompt as well; keep only what follows
    # the response marker.
    decoded = tokenizer.decode(result.sequences[0])
    sections = decoded.split("### Response:")
    return sections[1].strip()

# Example: the instruction (in Thai) asks the model to solve the equation for X,
# and the equation itself is supplied as the input.
evaluate(
    instruction="จงแก้สมการต่อไปนี้ X เท่ากับเท่าไหร่",
    input="X+Y=15 and Y=7",
)
# Sample output, kept as a bare string literal (it has no runtime effect).
""" X = 8 """