Python实现语音识别(SpeechRecognition)
1、简介
https://pypi.org/project/SpeechRecognition/
https://github.com/Uberi/speech_recognition
SpeechRecognition 是一个用于执行语音识别的库,支持多种引擎和 API,可在线或离线使用。
1 | recognize_bing():Microsoft Bing Speech |
以上几个中只有 recognize_sphinx() 可与 CMU Sphinx 引擎脱机工作,其他六个都需要连接互联网。另外,SpeechRecognition 附带 Google Web Speech API 的默认 API 密钥,可直接使用它。其他的 API 都需要使用 API 密钥或用户名/密码组合进行身份验证。
2、安装SpeechRecognition
安装库SpeechRecognition:
1 | #python -m pip install --upgrade pip |
检查版本
1 | import speech_recognition as sr |
硬件检查
1 | import speech_recognition as sr |
3、安装pyaudio
1 | pip install pyaudio |
4、安装pocketsphinx(offline)
1 | pip install pocketsphinx |
4.1 中文声学模型、语言模型和字典文件;
使用 pocketsphinx 识别中文时,需要另外安装中文的声学模型、语言模型和字典文件。
https://sourceforge.net/projects/cmusphinx/files/Acoustic%20and%20Language%20Models/Mandarin/
具体可以参考以下文章:
测试脚本:
- 导入文件
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
import speech_recognition as sr
# Recognizer instance used for every recognition call in this script.
r = sr.Recognizer()

# Read the audio data out of the file with record().
with sr.AudioFile("chinese.flac") as source:
    # r.adjust_for_ambient_noise(source)
    audio = r.record(source)
type(audio)  # result is discarded when run as a script; only visible in a REPL

# Recognize with Sphinx and print the text.
# Alternative for Mandarin input: r.recognize_sphinx(audio, language='zh-cn')
print(r.recognize_sphinx(audio, language='en-US'))
# recognize speech using Sphinx
try:
print("Sphinx thinks you said " + r.recognize_sphinx(audio))
except sr.UnknownValueError:
print("Sphinx could not understand audio")
except sr.RequestError as e:
    print("Sphinx error; {0}".format(e))
- 调用麦克风录音
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
import speech_recognition as sr
# Capture audio from the microphone.
r = sr.Recognizer()
with sr.Microphone() as source:
    print("Say something!")
    audio = r.listen(source)

# Run Sphinx recognition; report the text on success, or the failure reason.
try:
    heard = r.recognize_sphinx(audio)
except sr.UnknownValueError:
    print("Sphinx could not understand audio")
except sr.RequestError as e:
    print("Sphinx error; {0}".format(e))
else:
    print("Sphinx thinks you said " + heard)
5、安装语音识别库
5.1 安装Vosk (offline)
1 | python3 -m pip install vosk |
您还必须安装 Vosk 模型:
以下是可供下载的模型。您必须将它们放在项目的模型文件夹中,例如“your-project-folder/models/your-vosk-model”
https://alphacephei.com/vosk/models
测试脚本:
注: 在测试脚本的所在文件夹,新建model子文件夹,然后把上面下载的模型解压到里面如下:
1 | import speech_recognition as sr |
5.2 安装Whisper(offline)
1 | pip install zhconv |
whisper用法:
- 读取文件
1
2
3
4
5
6
import whisper
if __name__ == '__main__':
model = whisper.load_model("tiny")
result = model.transcribe("audio.mp3", fp16=False, language="Chinese")
    print(result["text"])
- 实时转录(录音完成后识别,目前没找到可以实时逐句翻译,但可以结合处理)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import whisper
import zhconv
import wave # 使用wave库可读、写wav类型的音频文件
import pyaudio # 使用pyaudio库可以进行录音,播放,生成wav文件
def record(time):
    """Record audio through PyAudio and save it to ``./output.wav``.

    Opens a 16-bit, single-channel, 16000 Hz input stream, reads it in
    1024-frame chunks for roughly ``time`` seconds, and writes the captured
    frames to a WAV file.

    Args:
        time: Recording duration in seconds. The number of chunk reads is
            ``int(16000 / 1024 * time)``, so the recording can be up to one
            chunk shorter than requested.
    """
    chunk = 1024  # frames fetched per stream.read() call
    sample_format = pyaudio.paInt16  # 16-bit samples
    channels = 1
    rate = 16000  # samples per second
    output_path = "./output.wav"

    p = pyaudio.PyAudio()
    try:
        sample_width = p.get_sample_size(sample_format)
        stream = p.open(format=sample_format,
                        channels=channels,
                        rate=rate,
                        input=True,  # True marks this as an input (capture) stream
                        frames_per_buffer=chunk)
        try:
            print("* recording")
            frames = [stream.read(chunk)
                      for _ in range(int(rate / chunk * time))]
            print("* done recording")
        finally:
            # Release the stream even if a read fails part-way through.
            stream.stop_stream()
            stream.close()
    finally:
        p.terminate()

    # The context manager closes the file even if writing raises.
    with wave.open(output_path, 'wb') as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(sample_width)  # must match the stream's sample format
        wf.setframerate(rate)  # must match the stream's sample rate
        wf.writeframes(b''.join(frames))
if __name__ == '__main__':
    asr_model = whisper.load_model("tiny")
    record(3)  # recording duration in seconds
    transcription = asr_model.transcribe("output.wav", language='Chinese', fp16=True)
    # Normalize the recognized text to the 'zh-cn' variant before printing.
    print(zhconv.convert(transcription["text"], 'zh-cn'))
测试脚本:
1 | import speech_recognition as sr |
6. 测试
6.1 fastapi
1 | import json |
6.2 recognize_sphinx
1 | import logging |
6.3 语音生成音频文件
- 方法1:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
import speech_recognition as sr
# Use SpeechRecognition to record 使用语音识别包录制音频
def my_record(rate=16000):
    """Listen on the microphone and save the captured audio as a WAV file.

    The audio is written to ``voices/myvoices.wav``. The ``voices``
    directory is created when it does not exist, so a finished recording is
    not lost to a ``FileNotFoundError``.

    Args:
        rate: Sample rate passed to ``sr.Microphone``.
    """
    import os

    output_path = "voices/myvoices.wav"

    r = sr.Recognizer()
    with sr.Microphone(sample_rate=rate) as source:
        print("please say something")
        audio = r.listen(source)

    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "wb") as f:
        f.write(audio.get_wav_data())
    print("录音完成!")
my_record()
- 方法2:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import wave
from pyaudio import PyAudio, paInt16
framerate = 16000 # sample rate in Hz, used for both the input stream and the WAV header
num_samples = 2000 # frames per buffer, and frames fetched per stream.read() call
channels = 1 # number of audio channels
sampwidth = 2 # sample width in bytes (2 bytes per sample)
FILEPATH = 'voices/myvoices.wav' # output path passed to save_wave_file()
def save_wave_file(filepath, data):
    """Write recorded audio chunks to a WAV file.

    The channel count, sample width and frame rate come from the
    module-level ``channels``, ``sampwidth`` and ``framerate`` settings.

    Args:
        filepath: Destination of the WAV file. When it is a string path,
            its parent directory is created if missing.
        data: Iterable of ``bytes`` chunks; they are concatenated in order
            and written as the frame data.
    """
    import os

    # Only string paths get a directory check; anything else goes to wave.open() untouched.
    if isinstance(filepath, str):
        parent = os.path.dirname(filepath)
        if parent:
            os.makedirs(parent, exist_ok=True)

    # The context manager closes the file even if a write fails.
    with wave.open(filepath, 'wb') as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(sampwidth)
        wf.setframerate(framerate)
        wf.writeframes(b''.join(data))
#录音
def my_record():
    """Record about 10 seconds of audio and save it to ``FILEPATH``.

    Opens a PyAudio input stream using the module-level ``channels``,
    ``framerate`` and ``num_samples`` settings, reads buffers until the
    time limit passes, then hands the collected chunks to
    ``save_wave_file``.
    """
    # Imported here because the snippet's own imports do not include time.
    import time

    record_seconds = 10  # recording duration in seconds

    pa = PyAudio()
    try:
        # Open a new audio input stream.
        stream = pa.open(format=paInt16, channels=channels,
                         rate=framerate, input=True,
                         frames_per_buffer=num_samples)
        try:
            my_buf = []  # recorded chunks, in order
            deadline = time.monotonic() + record_seconds
            print('正在录音...')
            while time.monotonic() < deadline:
                # Each read fetches num_samples frames.
                my_buf.append(stream.read(num_samples))
            print('录音结束.')
        finally:
            # Release the stream even if a read fails.
            stream.stop_stream()
            stream.close()
    finally:
        pa.terminate()

    save_wave_file(FILEPATH, my_buf)
7、进阶用法
7.1 用vosk、sr实时识别(目前测试流程最合适的)
可以用这个方法推导其他语音识别的用法
1 | # -*- coding: utf-8 -*- |
评论