import librosa
import torch

from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2FeatureExtractor
"""Transcribe an audio file with a Bengali wav2vec2 XLSR model, comparing
plain greedy (argmax) decoding against language-model-boosted decoding."""
from transformers import Wav2Vec2ProcessorWithLM

# Load the audio and resample to the 16 kHz rate wav2vec2 was trained on.
# NOTE: 'file_path' is a placeholder — replace with a real audio file path.
audio, sr = librosa.load('file_path')
audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)

model_path = "mushrafi88/wav2vec2_xlsr_bn_lm"

# BUG FIX: fall back to CPU so the script also runs on machines without CUDA
# (the original hard-coded "cuda" and crashed on CPU-only hosts).
device = "cuda" if torch.cuda.is_available() else "cpu"

model = Wav2Vec2ForCTC.from_pretrained(model_path).to(device)
processorlm = Wav2Vec2ProcessorWithLM.from_pretrained(model_path)  # LM-rescored beam-search decoding
processor = Wav2Vec2Processor.from_pretrained(model_path)          # plain CTC decoding

inputs = processor(audio, sampling_rate=16_000, return_tensors="pt").to(device)

# Inference only: disable autograd to save memory and time.
# BUG FIX: the assignment below was originally unindented, which is an
# IndentationError — it must live inside the with-block.
with torch.no_grad():
    logits = model(**inputs).logits

# LM-boosted transcription: beam-search decode over the raw logits using the
# n-gram language model bundled with the processor.
transcription = processorlm.batch_decode(logits.cpu().numpy()).text

# Greedy transcription: take the argmax token at each frame, no language model.
pred_ids = torch.argmax(logits, dim=-1)[0]
wav2vec2 = processor.decode(pred_ids)
wav2vec2_lm = transcription[0]

torch.cuda.empty_cache()  # release cached GPU memory now that inference is done
print(wav2vec2)
print(wav2vec2_lm)