ISL-camembert-beauty for Aspect Extraction - UPDATED (2023)
This is a RoBERTa-base model trained on a Thai text dataset (78.5 GB of uncompressed text in total) and fine-tuned for aspect extraction on private datasets. The model is intended for Thai text and was trained on sub-sentences.
<b>Labels</b>: "O", "B-Quality", "E-Quality", "I-Quality", "S-Quality", "B-Price", "E-Price", "S-Price", "I-Price", "I-Promotion", "B-Promotion", "E-Promotion", "S-Promotion"
Full classification example
import torch
import re
import numpy as np
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoConfig
from transformers import pipeline
from scipy.special import softmax
from pythainlp.ulmfit import (
process_thai,
replace_rep_after,
fix_html,
ungroup_emoji,
replace_wrep_post,
remove_space
)
# Preprocess text
def text_processing_newmm(text):
    """Clean and tokenize Thai text, returning tokens joined by ``|``.

    Steps: drop text that is entirely a URL/domain, normalize the
    double sara-e sequence to sara-ae, strip punctuation, tokenize with
    pythainlp's ``process_thai`` (newmm rules), then remove zero-width
    spaces, repetition markers, and pure numbers.

    Parameters:
        text: raw input text (coerced to ``str``).

    Returns:
        A single string of surviving tokens joined by ``"|"``.
    """
    punctuations = """!()+-[]{;:'}"“”\,‘’<>/?@^$–^*_~ๆฯ•.."""
    # BUGFIX: the original pattern kept PHP-style delimiters (@...@i),
    # which makes it unmatchable in Python's `re` (a literal '@' before
    # the '^' start anchor can never be satisfied). This is the same
    # pattern expressed natively: replace the text only when the WHOLE
    # string is a URL/domain, case-insensitively.
    final = re.sub(
        r"^(http://|https://)?([a-z0-9][a-z0-9\-]*\.)+[a-z0-9][a-z0-9\-]*$",
        "",
        str(text),
        flags=re.IGNORECASE,
    )
    # Normalize "เเ" (two sara-e) to the single character "แ" (sara-ae).
    final = final.replace("เเ", "แ")
    final = "".join(u for u in str(final) if u not in punctuations)
    final = process_thai(final,
                         pre_rules=[replace_rep_after, fix_html],
                         post_rules=[ungroup_emoji,
                                     replace_wrep_post,
                                     remove_space]
                         )
    # Drop zero-width spaces and pythainlp's rep/wrep marker tokens.
    final = [s for s in final if "\u200b" not in s]
    final = [s for s in final if "xxrep" not in s]
    final = [s for s in final if "xxwrep" not in s]
    # Drop integer tokens (optionally negative). startswith() is safe on
    # empty tokens, where the original x[0] raised IndexError.
    final = [x for x in final if not (x.isdigit()
                                      or (x.startswith('-') and x[1:].isdigit()))]
    final = "|".join(word for word in final)
    return final
def parse_tokens(
    clean_tokens, ignore_tokens=None, skip_split=False, keep_whitespace=False
):
    """Turn a ``|``-joined token string into a cleaned list of tokens.

    Parameters:
        clean_tokens: ``|``-separated token string (as produced by
            ``text_processing_newmm``), or an already-split sequence when
            ``skip_split`` is True.
        ignore_tokens: optional collection of tokens to drop after
            stripping. (Was a mutable default ``[]``; ``None`` is the
            safe equivalent.)
        skip_split: when True, treat ``clean_tokens`` as pre-split.
        keep_whitespace: when True, keep tokens that strip to "".

    Returns:
        List of stripped tokens.
    """
    if skip_split:
        tokens_list = clean_tokens
    else:
        try:
            tokens_list = clean_tokens.split("|")
        except AttributeError:
            # Non-string input (e.g. None): behave as "no tokens".
            tokens_list = []
    tokens_list = [tok.strip() for tok in tokens_list]
    if ignore_tokens:
        # Set membership: O(1) per token instead of scanning the list.
        ignored = set(ignore_tokens)
        tokens_list = [tok for tok in tokens_list if tok not in ignored]
    if not keep_whitespace:
        # Drop tokens that stripped down to the empty string.
        tokens_list = [tok for tok in tokens_list if tok]
    return tokens_list
def format_words(words_list):
    """Join a list of word tokens into one space-separated string."""
    return " ".join(words_list)
def group_entities_iobes(ner_results):
    """Merge token-level IOBES predictions into span-level entities.

    ``B-``/``I-``/``E-`` tokens of the same label are concatenated into a
    single entity dict (``word``, ``entity`` without the prefix, ``start``,
    ``end``). ``S-`` and ``O`` tokens are appended to the result as-is.
    Tokens whose tag does not continue the open span flush (or discard)
    that span, mirroring the original pass.
    """
    merged = []
    pending = None  # the span currently being assembled, or None

    def _flush():
        # Emit the open span (if any) and reset.
        nonlocal pending
        if pending is not None:
            merged.append(pending)
        pending = None

    for token in ner_results:
        tag = token['entity']
        if tag.startswith('B-'):
            _flush()
            pending = {
                'word': token['word'],
                'entity': tag[2:],
                'start': token['start'],
                'end': token['end'],
            }
        elif tag.startswith('I-'):
            if pending is not None and pending['entity'] == tag[2:]:
                pending['word'] += token['word']
                pending['end'] = token['end']
            else:
                # Label mismatch: close the open span; the stray token is dropped.
                _flush()
        elif tag.startswith('E-'):
            if pending is not None and pending['entity'] == tag[2:]:
                pending['word'] += token['word']
                pending['end'] = token['end']
                _flush()
            else:
                # Dangling E- tag: close the open span; the token is dropped.
                _flush()
        else:  # 'S-' singles and 'O' tags pass through untouched
            _flush()
            merged.append(token)

    _flush()
    return merged
# --- Inference example ----------------------------------------------------
# NOTE: the original rebound `model` from the hub-id string to the loaded
# model object; a distinct name for the checkpoint id avoids the shadowing.
model_name = "praramnine/isl-camembert-beauty-aspect-v1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)  # loaded for inspection; not passed to the pipeline
model = AutoModelForTokenClassification.from_pretrained(model_name)
nlp = pipeline("ner", model=model, tokenizer=tokenizer)
text = "แต่ต้องใช้รองพื้นช่วย แต่ใช้แล้วรู้สึกบางเบาแล้วหน้าขาวเนียนๆดี เนื้อครีมเราว่าดูหนืดๆไปหน่อย แต่รวมๆชอบนะค่ะ แต่ถ้าหมดแล้วอาจลองตัวอื่นหน่อย"
# Clean -> tokenize -> re-join with spaces so the pipeline sees word boundaries.
text = text_processing_newmm(text)
text = parse_tokens(text)
formatted_text = format_words(text)
ner_results = nlp(formatted_text)
# Drop bare SentencePiece word-boundary markers ('▁') from the predictions.
ner_results_cleaned = [entity for entity in ner_results if entity['word'] != '▁']
print(ner_results_cleaned)
Output:
[{'entity': 'B-Price', 'score': 0.993042, 'index': 12, 'word': 'ราคา', 'start': 22, 'end': 26}, {'entity': 'I-Price', 'score': 0.9918081, 'index': 14, 'word': 'ไม่', 'start': 27, 'end': 30}, {'entity': 'I-Price', 'score': 0.9626237, 'index': 16, 'word': 'แพง', 'start': 31, 'end': 34}, {'entity': 'B-Quality', 'score': 0.9992849, 'index': 25, 'word': 'ไม่', 'start': 52, 'end': 55}, {'entity': 'E-Quality', 'score': 0.9492479, 'index': 27, 'word': 'แพ้', 'start': 56, 'end': 59}, {'entity': 'B-Quality', 'score': 0.9918962, 'index': 29, 'word': 'สิว', 'start': 60, 'end': 63}, {'entity': 'I-Quality', 'score': 0.99981385, 'index': 31, 'word': 'ไม่', 'start': 64, 'end': 67}, {'entity': 'E-Quality', 'score': 0.9970843, 'index': 33, 'word': 'ขึ้น', 'start': 68, 'end': 72}]
grouped_entities = group_entities_iobes(ner_results_cleaned)
print(grouped_entities)
Output:
[{'word': 'ราคาไม่แพง', 'entity': 'Price', 'start': 22, 'end': 34}, {'word': 'ไม่แพ้', 'entity': 'Quality', 'start': 52, 'end': 59}, {'word': 'สิวไม่ขึ้น', 'entity': 'Quality', 'start': 60, 'end': 72}]