import torch
import librosa
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM

# Load the audio at its native sampling rate, then resample to the 16 kHz the model expects
audio, sr = librosa.load('file_path', sr=None)
audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)

model_path = "mushrafi88/wav2vec2_xlsr_bn_lm"
model = Wav2Vec2ForCTC.from_pretrained(model_path).to("cuda")
processor_lm = Wav2Vec2ProcessorWithLM.from_pretrained(model_path)  # CTC decoder backed by an n-gram LM
processor = Wav2Vec2Processor.from_pretrained(model_path)           # plain CTC processor, no LM

inputs = processor(audio, sampling_rate=16_000, return_tensors="pt").to("cuda")

with torch.no_grad():
    logits = model(**inputs).logits

# Beam-search decoding with the language model
wav2vec2_lm = processor_lm.batch_decode(logits.cpu().numpy()).text[0]

# Greedy (argmax) decoding without the language model
pred_ids = torch.argmax(logits, dim=-1)[0]
wav2vec2 = processor.decode(pred_ids)

torch.cuda.empty_cache()
print(wav2vec2)
print(wav2vec2_lm)
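
# Optional: a minimal sketch of tuning the LM beam search. beam_width,
# alpha (LM weight), and beta (word-insertion bonus) are pyctcdecode
# parameters exposed by Wav2Vec2ProcessorWithLM.batch_decode; the values
# below are illustrative assumptions, not tuned for this model.
tuned = processor_lm.batch_decode(
    logits.cpu().numpy(), beam_width=100, alpha=0.5, beta=1.0
).text[0]
print(tuned)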