Language Identification

Canine for Language Identification

A Canine model trained on the WiLI-2018 dataset to identify the language of a text.



Test Results


Function that builds a dictionary mapping each label id to the English name of its language:

import datasets
import pycountry
def int_to_lang():
    """Build a mapping from WiLI-2018 label ids to English language names.

    Loads the ``wili_2018`` dataset and walks the class names of its
    ``train`` split's ``label`` feature. Each class name is looked up with
    ``pycountry.languages.get(alpha_3=...)``; when that lookup yields a
    language, its ``name`` is used. Otherwise the name comes from a small
    hand-written table of labels that are not ISO 639-3 codes.

    Returns:
        dict: integer label id -> language name (``str``). A label that is
        found neither by pycountry nor in the hand-written table maps to its
        raw label code.
    """
    dataset = datasets.load_dataset('wili_2018')
    # names for languages not in iso-639-3 from wikipedia
    non_iso_languages = {'roa-tara': 'Tarantino', 'zh-yue': 'Cantonese', 'map-bms': 'Banyumasan',
                         'nds-nl': 'Dutch Low Saxon', 'be-tarask': 'Belarusian'}
    # create dictionary from data set labels to language names
    lab_to_lang = {}
    for i, lang in enumerate(dataset['train'].features['label'].names):
        full_lang = pycountry.languages.get(alpha_3=lang)
        if full_lang:
            lab_to_lang[i] = full_lang.name
        else:
            # Fall back to the raw label code so one unmapped label does not
            # abort building the whole dictionary with a KeyError.
            lab_to_lang[i] = non_iso_languages.get(lang, lang)
    return lab_to_lang

