# Edit the .pb model path (graph_file) and the size of args.list below, then run this script.
# -*- coding: utf-8 -*-
"""
Created on Sat Nov 10 16:59:03 2018
@author: chen
"""
import librosa
import tensorflow as tf
import pyaudio
import sys
import time
import array
import numpy as np
import queue
from collections import deque
from easydict import EasyDict
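# Runtime configuration. args.list is the model input shape
# (n_mels, time frames, channels); adjust it to match your trained network.
# rt_oversamples sets how many prediction steps are taken per second of audio.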
args = EasyDict()
args.duration = 1
args.rate = 44100
args.hop_length = 80
args.n_mels = 40
args.rt_oversamples = 10
args.samples = args.rate * args.duration
args.list = (args.n_mels, 1 + int(np.floor(args.samples / args.hop_length)), 1)
args.rt_process_count = 1
args.model = 'alexnet'
args.rt_chunk_samples = args.rate // args.rt_oversamples
args.mels_onestep_samples = args.rt_chunk_samples * args.rt_process_count
args.mels_convert_samples = args.samples + args.mels_onestep_samples
args.fmax = args.rate // 2
args.n_fft = args.n_mels * 20
args.labels = ['dog_bark', 'children_playing', 'car_horn', 'air_conditioner',
'street_music', 'gun_shot', 'siren', 'engine_idling', 'jackhammer',
'drilling']
graph_file="E:\\ML\\UrbanSound8K\\code\\UrbanSound8K\\others sussfer\\first\\99.pb"
def audio_to_melspectrogram(args, audio):
    spectrogram = librosa.feature.melspectrogram(y=audio,
                                                 sr=args.rate,
                                                 n_mels=args.n_mels,
                                                 hop_length=args.hop_length,
                                                 n_fft=args.n_fft,
                                                 fmin=20,
                                                 fmax=args.fmax)
    spectrogram = librosa.power_to_db(spectrogram)
    spectrogram = spectrogram.astype(np.float32)
    return spectrogram
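# PyAudio stream callback: runs on the audio thread and queues each
# incoming chunk of samples for the main loop to consume.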
def callback(in_data, frame_count, time_info, status):
    wave = array.array('h', in_data)  # paInt16 frames are 16-bit signed samples
raw_frames.put(wave, True)
return (None, pyaudio.paContinue)
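# Print the most likely class and its ensembled probability.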
def on_predicted(ensembled_pred):
result = np.argmax(ensembled_pred)
print(args.labels[result], ensembled_pred[result])
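# Normalize each sample in place: shift to a zero minimum, then scale by
# the peak magnitude (+1.0 also guards against division by zero).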
def samplewise_normalize_audio_X(X):
for i in range(len(X)):
X[i] -= np.min(X[i])
X[i] /= (np.max(np.abs(X[i])) + 1.0)
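# Ensemble the queued predictions with an element-wise geometric mean,
# which smooths out one-off spikes in any single frame's output.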
def geometric_mean_preds(_preds):
preds = _preds.copy()
for i in range(1, preds.shape[0]):
preds[0] = np.multiply(preds[0], preds[i])
return np.power(preds[0], 1/preds.shape[0])
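# Shared state: raw frames from the audio callback, the accumulated
# sample buffer, and the last 10 predictions used for ensembling.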
raw_frames = queue.Queue(maxsize=100)
raw_audio_buffer = []
pred_queue = deque(maxlen=10)
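# One step of the streaming pipeline: drain queued frames, and once a full
# window is buffered, convert it to mel spectrograms, normalize, predict,
# and ensemble the result over the recent prediction queue.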
def main_process(model, on_predicted):
global raw_audio_buffer
while not raw_frames.empty():
raw_audio_buffer.extend(raw_frames.get())
if len(raw_audio_buffer) >= args.mels_convert_samples: break
if len(raw_audio_buffer) < args.mels_convert_samples: return
audio_to_convert = np.array(raw_audio_buffer[:args.mels_convert_samples]) / 32767
raw_audio_buffer = raw_audio_buffer[args.mels_onestep_samples:]
mels = audio_to_melspectrogram(args, audio_to_convert)
X = []
for i in range(args.rt_process_count):
cur = int(i * args.list[1] / args.rt_oversamples)
X.append(mels[:, cur:cur+args.list[1], np.newaxis])
X = np.array(X)
samplewise_normalize_audio_X(X)
raw_preds = model.predict(X)
for raw_pred in raw_preds:
pred_queue.append(raw_pred)
ensembled_pred = geometric_mean_preds(np.array([pred for pred in pred_queue]))
on_predicted(ensembled_pred)
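# Load a frozen TensorFlow graph (.pb). This uses the TF 1.x API
# (tf.GraphDef / tf.Session); on TF 2.x import tf.compat.v1 instead.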
def load_graph(model_file):
graph = tf.Graph()
graph_def = tf.GraphDef()
with open(model_file, "rb") as f:
graph_def.ParseFromString(f.read())
with graph.as_default():
tf.import_graph_def(graph_def)
return graph
def my_exit(model):
    model.close()
    sys.exit(0)
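# Thin wrapper around a frozen Keras model: looks up the input, learning
# phase, and output ops by name and runs inference in a single session.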
class KerasTFGraph:
def __init__(self, model_pb_filename, input_name,
keras_learning_phase_name, output_name):
self.graph = load_graph(model_pb_filename)
self.layer_in = self.graph.get_operation_by_name(input_name)
        self.layer_klp = self.graph.get_operation_by_name(keras_learning_phase_name)
self.layer_out = self.graph.get_operation_by_name(output_name)
self.sess = tf.Session(graph=self.graph)
def predict(self, X):
        preds = self.sess.run(self.layer_out.outputs[0],
                              {self.layer_in.outputs[0]: X,
                               self.layer_klp.outputs[0]: 0})
return preds
def close(self):
self.sess.close()
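# Graph node names per supported architecture; these must match the
# names baked into the frozen .pb file.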
def get_model(graph_file):
model_node = {
'alexnet': ['import/conv2d_1_input',
'import/batch_normalization_1/keras_learning_phase',
'import/output0']
}
return KerasTFGraph(
        graph_file,  # args.runtime_model_file is never defined, so use the argument directly
input_name=model_node[args.model][0],
keras_learning_phase_name=model_node[args.model][1],
output_name=model_node[args.model][2])
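# Open a mono 16-bit microphone stream and run the prediction loop until
# the stream stops.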
def run_predictor():
model = get_model(graph_file)
FORMAT = pyaudio.paInt16
CHANNELS = 1
audio = pyaudio.PyAudio()
stream = audio.open(
format=FORMAT,
channels=CHANNELS,
rate=args.rate,
input=True,
frames_per_buffer=1024,
start=False,
stream_callback=callback
)
stream.start_stream()
while stream.is_active():
main_process(model, on_predicted)
        time.sleep(0.01)  # poll well within one step (0.1 s of audio) so the buffer cannot lag behind
stream.stop_stream()
stream.close()
audio.terminate()
my_exit(model)
if __name__ == '__main__':
    run_predictor()