generated_from_trainer

<!-- This model card has been generated automatically according to the information the Trainer had access to. You should probably proofread and complete it, then remove this comment. -->

wav2vec2-large-xls-r-300m-hi

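This model is a fine-tuned version of facebook/wav2vec2-xls-r-300m for Hindi speech recognition (the training dataset name was not recorded by the Trainer). It achieves the following results on the evaluation set: Loss 0.3611, WER 0.2992, CER 0.0786 (the final row of the training results table below).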

View the results on Kaggle Notebook: https://www.kaggle.com/code/kingabzpro/wav2vec-2-eval

Evaluation

import torch
from datasets import load_dataset, load_metric
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import librosa
import unicodedata
import re


# Hindi ("hi") test split of Common Voice 8.0, plus the two error-rate metrics.
test_dataset = load_dataset(
    "mozilla-foundation/common_voice_8_0",
    "hi",
    split="test",
)
wer = load_metric("wer")
cer = load_metric("cer")

# The processor and the model are loaded from the same checkpoint, so the
# identifier is spelled once and shared by both calls.
MODEL_ID = "kingabzpro/wav2vec2-large-xls-r-300m-hi"
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
model.to("cuda")


# Preprocessing the datasets.
# Punctuation stripped from transcripts before scoring. Raw strings keep the
# regex escapes intact without relying on Python passing unknown string
# escapes such as "\," through unchanged, which emits an invalid-escape warning.
# Compiled once here because the function runs for every example.
_CHARS_TO_IGNORE_RE = re.compile(r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�\’\'\|\&\–]')
_LATIN_LETTERS_RE = re.compile(r'[A-Za-z]')


def speech_file_to_array_fn(batch):
    """Normalise one example's transcript and load its audio.

    Args:
        batch: A single dataset example. ``batch["sentence"]`` is the
            reference transcript and ``batch["path"]`` is the audio file path.

    Returns:
        The same mapping, with ``"sentence"`` replaced by its normalised form
        and a new ``"speech"`` entry holding the array from ``librosa.load``.
    """
    # Lowercase first, then drop punctuation and any ASCII letters, and
    # finally fold compatibility forms with NFKC.
    sentence = batch["sentence"].lower()
    sentence = _CHARS_TO_IGNORE_RE.sub("", sentence)
    sentence = _LATIN_LETTERS_RE.sub("", sentence)
    batch["sentence"] = unicodedata.normalize("NFKC", sentence)

    # Load at 16 kHz, the same rate that is passed to the processor in
    # `evaluate`; the returned sampling rate is therefore not needed.
    speech_array, _ = librosa.load(batch["path"], sr=16_000)
    batch["speech"] = speech_array
    return batch

# Normalise each transcript and attach its loaded audio array, example by example.
test_dataset = test_dataset.map(function=speech_file_to_array_fn)

# Evaluating the model.
# The audio files were read as arrays above; here we run inference on them.
def evaluate(batch):
    """Transcribe a batch of preprocessed examples.

    Args:
        batch: Batched examples whose ``"speech"`` column holds the audio
            arrays to transcribe.

    Returns:
        The same batch with a ``"pred_strings"`` column containing one
        decoded transcript per input example.
    """
    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

    # Send the inputs to whichever device the model lives on rather than
    # hard-coding the device name.
    device = model.device

    # `padding=True` pads the clips in a batch to a common length. Forward the
    # padding mask when the processor returns one so padded positions are not
    # treated as audio; fall back to no mask otherwise.
    attention_mask = inputs.get("attention_mask")
    if attention_mask is not None:
        attention_mask = attention_mask.to(device)

    with torch.no_grad():
        logits = model(inputs.input_values.to(device), attention_mask=attention_mask).logits

    # Take the highest-scoring token id at every frame and decode to text.
    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_strings"] = processor.batch_decode(pred_ids, skip_special_tokens=True)
    return batch

# Transcribe the test set in batches of eight examples.
result = test_dataset.map(evaluate, batched=True, batch_size=8)

# Score the predicted transcripts against the normalised reference sentences.
predictions = result["pred_strings"]
references = result["sentence"]

wer_score = wer.compute(predictions=predictions, references=references)
print("WER: {}".format(wer_score))

cer_score = cer.compute(predictions=predictions, references=references)
print("CER: {}".format(cer_score))

WER: 0.5209850206372026

CER: 0.17902923538230883

Training hyperparameters

The following hyperparameters were used during training:

Training results

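| Training Loss | Epoch | Step | Validation Loss | Wer    | Cer    |
|:-------------:|:-----:|:----:|:---------------:|:------:|:------:|
| 7.0431        | 19.05 | 300  | 3.4423          | 1.0    | 1.0    |
| 2.3233        | 38.1  | 600  | 0.5965          | 0.4757 | 0.1329 |
| 0.5676        | 57.14 | 900  | 0.3962          | 0.3584 | 0.0954 |
| 0.3611        | 76.19 | 1200 | 0.3651          | 0.3190 | 0.0820 |
| 0.2996        | 95.24 | 1500 | 0.3611          | 0.2992 | 0.0786 |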

Framework versions