ISL-camembert-beauty for Aspect Extraction - UPDATED (2023)

This is a RoBERTa-base model pretrained on a Thai text dataset (78.5GB of uncompressed text) and fine-tuned for aspect extraction on private datasets. The model is intended for Thai text and was trained at the sub-sentence level.

<b>Labels</b>: "O", "B-Quality", "E-Quality", "I-Quality", "S-Quality", "B-Price", "E-Price", "S-Price", "I-Price", "I-Promotion", "B-Promotion", "E-Promotion", "S-Promotion"

Full classification example

import torch
import re
import numpy as np

from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoConfig
from transformers import pipeline
from scipy.special import softmax
from pythainlp.ulmfit import (
    process_thai,
    replace_rep_after,
    fix_html,
    ungroup_emoji,
    replace_wrep_post,
    remove_space
)

# Preprocess text
def text_processing_newmm(text):
    """Clean a Thai text string and return its tokens joined by "|".

    Steps: strip a whole-string URL/domain, normalize the "เเ" digraph to
    "แ", drop punctuation characters, tokenize with pythainlp's
    ``process_thai`` (newmm-based), then remove zero-width spaces, the
    ``xxrep``/``xxwrep`` repetition markers, and purely numeric tokens.

    :param text: input text (coerced to ``str``).
    :return: "|"-joined cleaned tokens.
    """
    punctuations = """!()+-[]{;:'}"“”\,‘’<>/?@^$–^*_~ๆฯ•.."""
    # Remove the input when it is nothing but a URL/domain.
    # NOTE: the original pattern kept PHP-style "@...@i" delimiters, which
    # Python's re treats as literal characters, making it unmatchable; use
    # a plain anchored pattern with re.IGNORECASE instead.
    final = re.sub(
        r"^(?:https?:\/\/)?(?:[a-z0-9][a-z0-9\-]*\.)+[a-z0-9][a-z0-9\-]*$",
        "",
        str(text),
        flags=re.IGNORECASE,
    )
    # Normalize the double sara-e digraph to the single sara-ae character.
    final = final.replace("เเ", "แ")
    final = "".join(u for u in str(final) if u not in punctuations)
    final = process_thai(final,
                        pre_rules=[replace_rep_after, fix_html],
                        post_rules=[ungroup_emoji,
                                    replace_wrep_post,
                                    remove_space]
                      )
    final = [s for s in final if "\u200b" not in s]
    final = [s for s in final if "xxrep" not in s]
    final = [s for s in final if "xxwrep" not in s]
    # Drop integer tokens (optionally signed); the leading "x and" guard
    # prevents an IndexError on empty tokens when indexing x[0].
    final = [x for x in final if x and not (x.isdigit()
                                            or (x[0] == '-' and x[1:].isdigit()))]
    final = "|".join(word for word in final)
    return final

def parse_tokens(
    clean_tokens, ignore_tokens=None, skip_split=False, keep_whitespace=False
):
    """Turn a "|"-delimited token string into a list of stripped tokens.

    :param clean_tokens: "|"-joined token string (or an iterable of tokens
        when ``skip_split`` is True).
    :param ignore_tokens: optional collection of tokens to drop after
        stripping. Defaults to None (drop nothing); avoids the
        mutable-default-argument pitfall of the original ``[]`` default.
    :param skip_split: when True, treat ``clean_tokens`` as already split.
    :param keep_whitespace: when exactly True, keep empty tokens;
        otherwise they are filtered out.
    :return: list of token strings; ``[]`` when the input cannot be split.
    """
    if skip_split:
        tokens_list = clean_tokens
    else:
        try:
            tokens_list = clean_tokens.split("|")
        except AttributeError:
            # Non-string input (e.g. None) has no .split — yield no tokens,
            # matching the original best-effort behavior but without the
            # overly broad bare except.
            tokens_list = []

    tokens_list = [tok.strip() for tok in tokens_list]

    # Drop explicitly ignored tokens, if any were requested.
    if ignore_tokens:
        ignored = set(ignore_tokens)
        tokens_list = [tok for tok in tokens_list if tok not in ignored]

    # Unless keep_whitespace is literally True, remove empty tokens
    # (preserves the original's identity check on True).
    if keep_whitespace is not True:
        tokens_list = [tok for tok in tokens_list if tok]

    return tokens_list

def format_words(words_list):
    """Concatenate a list of word tokens into a single space-separated string."""
    return " ".join(words_list)

def group_entities_iobes(ner_results):
    """Merge token-level IOBES NER predictions into span-level entities.

    B- opens a span, I- extends a matching open span, E- extends and closes
    it. A prefix that does not match the open span's label flushes that span
    as-is. Any other tag (S-*, O) flushes the open span and appends the raw
    prediction dict unchanged.

    :param ner_results: iterable of pipeline prediction dicts with at least
        'entity', 'word', 'start' and 'end' keys.
    :return: list of grouped spans ({'word', 'entity', 'start', 'end'}) and
        raw S-/O predictions, in input order.
    """
    merged = []
    open_span = None

    def flush():
        # Emit the span under construction, if any, and clear it.
        nonlocal open_span
        if open_span is not None:
            merged.append(open_span)
            open_span = None

    for item in ner_results:
        tag = item['entity']
        if tag.startswith('B-'):
            flush()
            open_span = {
                'word': item['word'],
                'entity': tag[2:],
                'start': item['start'],
                'end': item['end'],
            }
        elif tag.startswith('I-'):
            if open_span is not None and open_span['entity'] == tag[2:]:
                open_span['word'] += item['word']
                open_span['end'] = item['end']
            else:
                flush()
        elif tag.startswith('E-'):
            if open_span is not None and open_span['entity'] == tag[2:]:
                open_span['word'] += item['word']
                open_span['end'] = item['end']
            flush()
        else:  # S-* and O tags pass through untouched
            flush()
            merged.append(item)

    flush()
    return merged


# Load the fine-tuned token-classification model from the Hub.
# Keep the repo id in its own name instead of shadowing `model` with a
# string first and the model object afterwards.
model_name = "praramnine/isl-camembert-beauty-aspect-v1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
text = "แต่ต้องใช้รองพื้นช่วย แต่ใช้แล้วรู้สึกบางเบาแล้วหน้าขาวเนียนๆดี เนื้อครีมเราว่าดูหนืดๆไปหน่อย แต่รวมๆชอบนะค่ะ แต่ถ้าหมดแล้วอาจลองตัวอื่นหน่อย"
# Clean, tokenize, and space-join the review before running NER.
text = text_processing_newmm(text)
text = parse_tokens(text)
formatted_text = format_words(text)

ner_results = nlp(formatted_text)
# Drop bare SentencePiece word-boundary markers from the predictions.
ner_results_cleaned = [entity for entity in ner_results if entity['word'] != '▁']

print(ner_results_cleaned)

Output:

[{'entity': 'B-Price', 'score': 0.993042, 'index': 12, 'word': 'ราคา', 'start': 22, 'end': 26}, {'entity': 'I-Price', 'score': 0.9918081, 'index': 14, 'word': 'ไม่', 'start': 27, 'end': 30}, {'entity': 'I-Price', 'score': 0.9626237, 'index': 16, 'word': 'แพง', 'start': 31, 'end': 34}, {'entity': 'B-Quality', 'score': 0.9992849, 'index': 25, 'word': 'ไม่', 'start': 52, 'end': 55}, {'entity': 'E-Quality', 'score': 0.9492479, 'index': 27, 'word': 'แพ้', 'start': 56, 'end': 59}, {'entity': 'B-Quality', 'score': 0.9918962, 'index': 29, 'word': 'สิว', 'start': 60, 'end': 63}, {'entity': 'I-Quality', 'score': 0.99981385, 'index': 31, 'word': 'ไม่', 'start': 64, 'end': 67}, {'entity': 'E-Quality', 'score': 0.9970843, 'index': 33, 'word': 'ขึ้น', 'start': 68, 'end': 72}]
grouped_entities = group_entities_iobes(ner_results_cleaned)
print(grouped_entities)

Output:

[{'word': 'ราคาไม่แพง', 'entity': 'Price', 'start': 22, 'end': 34}, {'word': 'ไม่แพ้', 'entity': 'Quality', 'start': 52, 'end': 59}, {'word': 'สิวไม่ขึ้น', 'entity': 'Quality', 'start': 60, 'end': 72}]