英文

用于波斯语语音识别的微调 XLSR-53 大模型

本模型基于 facebook/wav2vec2-large-xlsr-53,使用 Common Voice 6.1 的训练集和验证集针对波斯语进行了微调。使用此模型时,请确保语音输入的采样率为 16kHz。

此模型的微调得益于 OVHcloud 慷慨提供的 GPU 额度。

训练时使用的脚本可以在此处找到: https://github.com/jonatasgrosman/wav2vec2-sprint

用法

可以直接使用模型(无需语言模型),如下所示...

使用 HuggingSound 库:

from huggingsound import SpeechRecognitionModel

# Load the Persian checkpoint through HuggingSound's wrapper class.
model = SpeechRecognitionModel("jonatasgrosman/wav2vec2-large-xlsr-53-persian")
# Placeholder paths: replace them with the audio files to transcribe.
audio_paths = ["/path/to/file.mp3", "/path/to/another_file.wav"]

# Transcribe every listed file in a single call.
transcriptions = model.transcribe(audio_paths)

编写自己的推理脚本:

import torch
import librosa
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Dataset configuration name passed to load_dataset() (Persian).
LANG_ID = "fa"
# Checkpoint identifier used for both from_pretrained() calls below.
MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-persian"
# Number of test examples to transcribe in this demo.
SAMPLES = 5

# Only the first SAMPLES rows of the test split are requested.
test_dataset = load_dataset("common_voice", LANG_ID, split=f"test[:{SAMPLES}]")

# The processor and the CTC model are loaded from the same checkpoint.
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)

# Preprocessing the datasets.
# We need to read the audio files as arrays
def speech_file_to_array_fn(batch):
    """Attach the decoded waveform to an example and upper-case its sentence.

    Args:
        batch: A single dataset example; its "path" entry names the audio
            file and its "sentence" entry holds the reference text.

    Returns:
        The same example, with the waveform stored under "speech" and the
        "sentence" entry replaced by its upper-cased form.
    """
    # librosa.load returns (samples, sample_rate); the rate is already fixed
    # by sr=16_000, so only the samples are kept.
    waveform, _ = librosa.load(batch["path"], sr=16_000)
    reference = batch["sentence"].upper()

    batch["speech"] = waveform
    batch["sentence"] = reference
    return batch

# Decode the audio of every example, then feed all waveforms to the processor
# at once (padding=True, PyTorch tensors requested via return_tensors="pt").
test_dataset = test_dataset.map(speech_file_to_array_fn)
inputs = processor(test_dataset["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

# Inference only: gradient tracking is disabled for the forward pass.
with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

# Pick the highest-scoring id along the last axis of the logits, then let the
# processor turn the id sequences back into text.
predicted_ids = torch.argmax(logits, dim=-1)
predicted_sentences = processor.batch_decode(predicted_ids)

# Print each reference sentence next to the model's transcription.
for i, predicted_sentence in enumerate(predicted_sentences):
    print("-" * 100)
    print("Reference:", test_dataset[i]["sentence"])
    print("Prediction:", predicted_sentence)
| Reference | Prediction |
| --- | --- |
| از مهمونداری کنار بکشم | از مهمانداری کنار بکشم |
| برو از مهرداد بپرس. | برو از ماقدعاد به پرس |
| خب ، تو چیكار می كنی؟ | خوب تو چیکار می کنی |
| مسقط پایتخت عمان در عربی به معنای محل سقوط است | مسقط پایتخت عمان در عربی به بعنای محل سقوط است |
| آه، نه اصلاُ! | اهنه اصلا |
| توانست | توانست |
| قصیده فن شعر میگوید ای دوستان | قصیده فن شعر میگوید ایدوستون |
| دو استایل متفاوت دارین | دوبوست داریل و متفاوت بری |
| دو روز قبل از کریسمس ؟ | اون مفتود پش پشش |
| ساعت های کاری چیست؟ | این توری که موشیکل خب |

评估

可以按如下方式在 Common Voice 的波斯语测试数据上评估该模型。

import torch
import re
import librosa
from datasets import load_dataset, load_metric
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Dataset configuration name passed to load_dataset() (Persian).
LANG_ID = "fa"
# Checkpoint identifier used for both from_pretrained() calls below.
MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-persian"
# Device the model and the input tensors are moved to before the forward pass.
DEVICE = "cuda"

# Punctuation marks and symbols that are removed from the reference sentences
# before scoring; the entries are joined into one regex character class below.
CHARS_TO_IGNORE = [",", "?", "¿", ".", "!", "¡", ";", ";", ":", '""', "%", '"', "�", "ʿ", "·", "჻", "~", "՞",
                   "؟", "،", "।", "॥", "«", "»", "„", "“", "”", "「", "」", "‘", "’", "《", "》", "(", ")", "[", "]",
                   "{", "}", "=", "`", "_", "+", "<", ">", "…", "–", "°", "´", "ʾ", "‹", "›", "©", "®", "—", "→", "。",
                   "、", "﹂", "﹁", "‧", "~", "﹏", ",", "{", "}", "(", ")", "[", "]", "【", "】", "‥", "〽",
                   "『", "』", "〝", "〟", "⟨", "⟩", "〜", ":", "!", "?", "♪", "؛", "/", "\\", "º", "−", "^", "ʻ", "ˆ"]

# The whole test split is loaded (no row slice, unlike the short demo above).
test_dataset = load_dataset("common_voice", LANG_ID, split="test")

# Both metrics are loaded from local script files; download them from the
# URLs below and place them next to this script.
wer = load_metric("wer.py") # https://github.com/jonatasgrosman/wav2vec2-sprint/blob/main/wer.py
cer = load_metric("cer.py") # https://github.com/jonatasgrosman/wav2vec2-sprint/blob/main/cer.py

# One character class matching any single ignored character; re.escape keeps
# entries such as "]", "^" and "\" literal inside the brackets.
chars_to_ignore_regex = f"[{re.escape(''.join(CHARS_TO_IGNORE))}]"

processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
# Move the model to DEVICE; evaluate() moves each batch of inputs there too.
model.to(DEVICE)

# Preprocessing the datasets.
# We need to read the audio files as arrays
def speech_file_to_array_fn(batch):
    """Decode an example's audio and normalise its reference sentence.

    Args:
        batch: A single dataset example; its "path" entry names the audio
            file and its "sentence" entry holds the reference text.

    Returns:
        The same example, with the waveform stored under "speech" and the
        "sentence" entry stripped of every character matched by
        ``chars_to_ignore_regex`` and then upper-cased.
    """
    # The script's import list does not include `warnings`, so without this
    # import the `with` statement below raises NameError on the first example.
    import warnings

    with warnings.catch_warnings():
        # Suppress warnings raised while the audio file is being decoded; the
        # filter is restored when the `with` block exits.
        warnings.simplefilter("ignore")
        # librosa.load returns (samples, sample_rate); the rate is already
        # fixed by sr=16_000, so only the samples are kept.
        speech_array, _ = librosa.load(batch["path"], sr=16_000)
    batch["speech"] = speech_array
    batch["sentence"] = re.sub(chars_to_ignore_regex, "", batch["sentence"]).upper()
    return batch

# Run the preprocessing function over the test set; no batching is requested,
# so it is called with one example at a time.
test_dataset = test_dataset.map(speech_file_to_array_fn)

# Running the model.
# Each batch of decoded waveforms is transcribed and the text is stored on it.
def evaluate(batch):
    """Transcribe one batch of examples and store the predicted strings.

    Args:
        batch: A batch of examples whose "speech" entry holds the waveforms
            produced by the preprocessing step.

    Returns:
        The same batch with the decoded transcriptions stored under
        "pred_strings".
    """
    features = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
    # The inputs must live on the same device as the model.
    input_values = features.input_values.to(DEVICE)
    attention_mask = features.attention_mask.to(DEVICE)

    # Inference only: gradient tracking is disabled for the forward pass.
    with torch.no_grad():
        output = model(input_values, attention_mask=attention_mask)

    # Highest-scoring id along the last axis, decoded back into text.
    batch["pred_strings"] = processor.batch_decode(torch.argmax(output.logits, dim=-1))
    return batch

# Transcribe the test set in batches of eight examples.
result = test_dataset.map(evaluate, batched=True, batch_size=8)

# Upper-case both sides so predictions and references are compared in the
# same letter case.
predictions = [x.upper() for x in result["pred_strings"]]
references = [x.upper() for x in result["sentence"]]

# The metric values are scaled by 100 before printing.
# NOTE(review): chunk_size is an argument of the custom wer.py / cer.py
# scripts; presumably it limits how many pairs are scored at once — confirm
# against those scripts.
print(f"WER: {wer.compute(predictions=predictions, references=references, chunk_size=1000) * 100}")
print(f"CER: {cer.compute(predictions=predictions, references=references, chunk_size=1000) * 100}")

测试结果:

在下表中,我报告了模型的词错误率(WER)和字符错误率(CER)。我还用上述评估脚本对其他模型进行了评估(于2021年4月22日)。请注意,下表中的结果可能与其他地方已报告的结果不同,这可能是由其他评估脚本的某些特殊之处造成的。

Model WER CER
jonatasgrosman/wav2vec2-large-xlsr-53-persian 30.12% 7.37%
m3hrdadfi/wav2vec2-large-xlsr-persian-v2 33.85% 8.79%
m3hrdadfi/wav2vec2-large-xlsr-persian 34.37% 8.98%

引用

如果您想引用此模型,可以使用此引用:

@misc{grosman2021xlsr53-large-persian,
  title={Fine-tuned {XLSR}-53 large model for speech recognition in {P}ersian},
  author={Grosman, Jonatas},
  howpublished={\url{https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-persian}},
  year={2021}
}