Soniox was founded in 2020 and is headquartered in Foster City, California. It has developed one of the best speech recognition engines on the market, and its cloud transcription engine is the same one audioXpress has used successfully for interviews and general speech-to-text conversion.
Focused on voice AI, Soniox introduced the world's first unsupervised learning approach for speech recognition in 2021. This innovation was key to overcoming the limitations that had previously held back the performance of speech systems.
In 2023, Soniox began transitioning from voice AI toward general-purpose AI.
We started using Soniox in 2022. At the time, a recognition result contained, besides each word's timing and recognition confidence, a Speaker field that marks who is talking in multi-speaker audio.
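To make that concrete, here is a minimal sketch of that result shape in C#. All values are invented for illustration; the field names come straight from the Result and Word messages in the proto later in this post:

using Soniox.Proto.SpeechService;

// Invented sample; Words carries per-word timing, confidence, and speaker.
var sample = new Result
{
    Words =
    {
        new Word { Text = "hello", StartMs = 120, DurationMs = 300, IsFinal = true, Speaker = 1, Confidence = 0.97 },
        new Word { Text = "world", StartMs = 460, DurationMs = 280, IsFinal = true, Speaker = 2, Confidence = 0.94 },
    },
    // Filled in when speaker diarization or identification is enabled.
    Speakers = { new ResultSpeaker { Speaker = 1 }, new ResultSpeaker { Speaker = 2 } },
};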
Starting in 2023, Soniox rolled out Section results based on semantic analysis (I even supplied a lot of the training data for it), consisting mainly of a SectionId and a Title.
Using Soniox is also simple: just search for Soniox on NuGet from C#. The one drawback is that the semantic-analysis service is fairly expensive.
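Here is a minimal synchronous-transcription sketch. The client and message types are what protoc generates from the proto below (csharp_namespace "Soniox.Proto.SpeechService"); the endpoint address, audio format, and config values are assumptions you should adapt to your own account and input file:

using System;
using System.IO;
using Google.Protobuf;
using Grpc.Net.Client;
using Soniox.Proto.SpeechService;

class Program
{
    static void Main()
    {
        // Assumed endpoint; use the address from your Soniox account.
        using var channel = GrpcChannel.ForAddress("https://api.soniox.com");
        var client = new SpeechService.SpeechServiceClient(channel);

        var response = client.Transcribe(new TranscribeRequest
        {
            ApiKey = Environment.GetEnvironmentVariable("SONIOX_API_KEY"),
            Config = new TranscriptionConfig
            {
                AudioFormat = "mp3",                  // must match the file below
                EnableGlobalSpeakerDiarization = true,
                MinNumSpeakers = 1,
                MaxNumSpeakers = 4,
            },
            Audio = ByteString.CopyFrom(File.ReadAllBytes("interview.mp3")),
        });

        // Dump each word with its timing, speaker tag, and confidence.
        foreach (var word in response.Result.Words)
        {
            Console.WriteLine($"{word.StartMs,7} ms  spk {word.Speaker}  conf {word.Confidence:F2}  {word.Text}");
        }
    }
}

The complete gRPC interface these types are generated from is the following proto file: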
syntax = "proto3";
package soniox.speech_service;
option csharp_namespace = "Soniox.Proto.SpeechService";
import "google/protobuf/timestamp.proto";
service SpeechService {
  // Synchronous transcription
  rpc Transcribe(TranscribeRequest) returns (TranscribeResponse) {}
  rpc TranscribeStream(stream TranscribeStreamRequest) returns (stream TranscribeStreamResponse) {}
  rpc TranscribeMeeting(stream TranscribeMeetingRequest) returns (stream TranscribeMeetingResponse) {}
  // Asynchronous transcription
  rpc TranscribeAsync(stream TranscribeAsyncRequest) returns (TranscribeAsyncResponse) {}
  rpc GetTranscribeAsyncStatus(GetTranscribeAsyncStatusRequest) returns (GetTranscribeAsyncStatusResponse) {}
  rpc GetTranscribeAsyncResult(GetTranscribeAsyncResultRequest) returns (stream GetTranscribeAsyncResultResponse) {}
  rpc DeleteTranscribeAsyncFile(DeleteTranscribeAsyncFileRequest) returns (DeleteTranscribeAsyncFileResponse) {}
  // Speech context
  rpc CreateSpeechContext(CreateSpeechContextRequest) returns (CreateSpeechContextResponse) {}
  rpc DeleteSpeechContext(DeleteSpeechContextRequest) returns (DeleteSpeechContextResponse) {}
  rpc ListSpeechContextNames(ListSpeechContextNamesRequest) returns (ListSpeechContextNamesResponse) {}
  rpc GetSpeechContext(GetSpeechContextRequest) returns (GetSpeechContextResponse) {}
  rpc UpdateSpeechContext(UpdateSpeechContextRequest) returns (UpdateSpeechContextResponse) {}
  // Speaker AI
  rpc AddSpeaker(AddSpeakerRequest) returns (AddSpeakerResponse) {}
  rpc GetSpeaker(GetSpeakerRequest) returns (GetSpeakerResponse) {}
  rpc RemoveSpeaker(RemoveSpeakerRequest) returns (RemoveSpeakerResponse) {}
  rpc ListSpeakers(ListSpeakersRequest) returns (ListSpeakersResponse) {}
  rpc AddSpeakerAudio(AddSpeakerAudioRequest) returns (AddSpeakerAudioResponse) {}
  rpc GetSpeakerAudio(GetSpeakerAudioRequest) returns (GetSpeakerAudioResponse) {}
  rpc RemoveSpeakerAudio(RemoveSpeakerAudioRequest) returns (RemoveSpeakerAudioResponse) {}
}
// Transcribe
message TranscribeRequest {
  string api_key = 1;
  TranscriptionConfig config = 4;
  bytes audio = 3;
}
message TranscribeResponse {
  Result result = 1;
  repeated Result channel_results = 2;
}
// TranscribeStream
message TranscribeStreamRequest {
  string api_key = 1;
  TranscriptionConfig config = 4;
  bytes audio = 3;
}
message TranscribeStreamResponse {
  Result result = 1;
}
// TranscribeMeeting
message TranscribeMeetingRequest {
  string api_key = 1;
  TranscriptionConfig config = 10;
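  // Inferred from the field names (check the Soniox docs): each participant's
  // audio is sent under its own stream_id, seq_num orders the chunks within
  // a stream, and start_of_segment/end_of_segment bracket one utterance.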
  int32 seq_num = 3;
  int32 stream_id = 4;
  bool start_of_segment = 5;
  bytes audio = 6;
  bool end_of_segment = 7;
}
message TranscribeMeetingResponse {
  int32 seq_num = 1;
  int32 stream_id = 2;
  bool start_of_segment = 3;
  bool end_of_segment = 4;
  Result result = 5;
  string error = 6;
}
// TranscribeAsync
message TranscribeAsyncRequest {
  string api_key = 1;
  string reference_name = 3;
  TranscriptionConfig config = 5;
  bytes audio = 4;
}
message TranscribeAsyncResponse {
  string file_id = 1;
}
// GetTranscribeAsyncStatus
message GetTranscribeAsyncStatusRequest {
  string api_key = 1;
  string file_id = 2;
}
message GetTranscribeAsyncStatusResponse {
  repeated TranscribeAsyncFileStatus files = 1;
}
message TranscribeAsyncFileStatus {
  string file_id = 1;
  string reference_name = 2;
  // One of: QUEUED, TRANSCRIBING, COMPLETED, FAILED
  string status = 3;
  // UTC timestamp
  google.protobuf.Timestamp created_time = 4;
  string error_message = 5;
  string transcribe_async_mode = 6;
}
// GetTranscribeAsyncResult
message GetTranscribeAsyncResultRequest {
  string api_key = 1;
  string file_id = 2;
}
message GetTranscribeAsyncResultResponse {
  bool separate_recognition_per_channel = 2;
  Result result = 1;
}
// DeleteTranscribeAsyncFile
message DeleteTranscribeAsyncFileRequest {
  string api_key = 1;
  string file_id = 2;
}
message DeleteTranscribeAsyncFileResponse {
}
// Common
message TranscriptionConfig {
  // Input options
  string audio_format = 1;
  int32 sample_rate_hertz = 2;
  int32 num_audio_channels = 3;
  // Output options
  bool include_nonfinal = 4;
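  // With include_nonfinal = true, streaming responses also include interim
  // words (is_final = false in the Word message) that may later be revised.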
  bool enable_separate_recognition_per_channel = 16;
  // Speech adaptation
  SpeechContext speech_context = 5;
  // Content moderation
  bool enable_profanity_filter = 6;
  repeated string content_moderation_phrases = 7;
  // Speaker diarization
  bool enable_streaming_speaker_diarization = 8;
  bool enable_global_speaker_diarization = 9;
  int32 min_num_speakers = 10;
  int32 max_num_speakers = 11;
  // Speaker identification
  bool enable_speaker_identification = 12;
  repeated string cand_speaker_names = 13;
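  // Candidate speaker names; these refer to speakers registered beforehand
  // via the Speaker AI RPCs (AddSpeaker / AddSpeakerAudio) further below.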
  // Model options
  string model = 14;
  bool enable_dictation = 15;
  // Asynchronous transcription
  string transcribe_async_mode = 17;
}
message Result {
  repeated Word words = 1;
  int32 final_proc_time_ms = 2;
  int32 total_proc_time_ms = 3;
  repeated ResultSpeaker speakers = 6;
  int32 channel = 7;
}
message Word {
  string text = 1;
  int32 start_ms = 2;
  int32 duration_ms = 3;
  bool is_final = 4;
  int32 speaker = 5;
  string orig_text = 8;
  double confidence = 9;
}
message ResultSpeaker {
  int32 speaker = 1;
  string name = 2;
}
// SpeechContext
message SpeechContext {
  repeated SpeechContextEntry entries = 1;
  string name = 2;
}
message SpeechContextEntry {
  repeated string phrases = 1;
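  // A positive boost biases recognition toward the phrases above; see the
  // Soniox docs for the supported value range.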
  double boost = 2;
}
message CreateSpeechContextRequest {
  string api_key = 1;
  SpeechContext speech_context = 2;
}
message CreateSpeechContextResponse {
}
message DeleteSpeechContextRequest {
  string api_key = 1;
  string name = 2;
}
message DeleteSpeechContextResponse {
}
message ListSpeechContextNamesRequest {
  string api_key = 1;
}
message ListSpeechContextNamesResponse {
  repeated string names = 1;
}
message GetSpeechContextRequest {
  string api_key = 1;
  string name = 2;
}
message GetSpeechContextResponse {
  SpeechContext speech_context = 1;
}
message UpdateSpeechContextRequest {
  string api_key = 1;
  SpeechContext speech_context = 2;
}
message UpdateSpeechContextResponse {
}
// Speaker AI
// AddSpeaker
message AddSpeakerRequest {
  string api_key = 1;
  string name = 2;
}
message AddSpeakerResponse {
  string name = 1;
  google.protobuf.Timestamp created = 2;
}
// GetSpeaker
message GetSpeakerRequest {
  string api_key = 1;
  string name = 2;
}
message GetSpeakerResponse {
  string name = 1;
  google.protobuf.Timestamp created = 2;
  repeated GetSpeakerResponseAudio audios = 3;
}
message GetSpeakerResponseAudio {
  string audio_name = 1;
  google.protobuf.Timestamp created = 2;
  int32 duration_ms = 3;
}
// RemoveSpeaker
message RemoveSpeakerRequest {
  string api_key = 1;
  string name = 2;
}
message RemoveSpeakerResponse {}
// ListSpeakers
message ListSpeakersRequest {
  string api_key = 1;
}
message ListSpeakersResponse {
  repeated ListSpeakersResponseSpeaker speakers = 1;
}
message ListSpeakersResponseSpeaker {
  string name = 1;
  google.protobuf.Timestamp created = 2;
  int32 num_audios = 3;
}
// AddSpeakerAudio
message AddSpeakerAudioRequest {
  string api_key = 1;
  string speaker_name = 2;
  string audio_name = 3;
  bytes audio = 4;
}
message AddSpeakerAudioResponse {
  string speaker_name = 1;
  string audio_name = 2;
  google.protobuf.Timestamp created = 3;
  int32 duration_ms = 4;
}
// GetSpeakerAudio
message GetSpeakerAudioRequest {
  string api_key = 1;
  string speaker_name = 2;
  string audio_name = 3;
}
message GetSpeakerAudioResponse {
  string speaker_name = 1;
  string audio_name = 2;
  google.protobuf.Timestamp created = 3;
  int32 duration_ms = 4;
  bytes audio = 5;
}
// RemoveSpeakerAudio
message RemoveSpeakerAudioRequest {
  string api_key = 1;
  string speaker_name = 2;
  string audio_name = 3;
}
message RemoveSpeakerAudioResponse {}
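For live audio, the bidirectional TranscribeStream RPC is the better fit. Below is a hedged sketch: it assumes the common speech-API convention that the first message carries the API key and config while the following messages carry raw audio chunks (the proto allows every field on every message, so check the Soniox docs for the exact contract), and the endpoint and audio-format strings are assumptions as before:

using System;
using System.IO;
using System.Threading.Tasks;
using Google.Protobuf;
using Grpc.Core;
using Grpc.Net.Client;
using Soniox.Proto.SpeechService;

class StreamingDemo
{
    static async Task Main()
    {
        using var channel = GrpcChannel.ForAddress("https://api.soniox.com"); // assumed endpoint
        var client = new SpeechService.SpeechServiceClient(channel);
        using var call = client.TranscribeStream();

        var sendTask = Task.Run(async () =>
        {
            // First message: credentials and config only (assumed convention).
            await call.RequestStream.WriteAsync(new TranscribeStreamRequest
            {
                ApiKey = Environment.GetEnvironmentVariable("SONIOX_API_KEY"),
                Config = new TranscriptionConfig
                {
                    AudioFormat = "pcm_s16le",   // assumed name for raw 16-bit PCM
                    SampleRateHertz = 16000,
                    NumAudioChannels = 1,
                    IncludeNonfinal = true,
                },
            });

            // Then stream the audio in fixed-size chunks.
            using var file = File.OpenRead("meeting.raw");
            var buffer = new byte[8192];
            int read;
            while ((read = await file.ReadAsync(buffer, 0, buffer.Length)) > 0)
            {
                await call.RequestStream.WriteAsync(new TranscribeStreamRequest
                {
                    Audio = ByteString.CopyFrom(buffer, 0, read),
                });
            }
            await call.RequestStream.CompleteAsync();
        });

        // Finals print as-is; interim (nonfinal) words are bracketed because
        // the engine may still revise them.
        await foreach (var response in call.ResponseStream.ReadAllAsync())
        {
            foreach (var word in response.Result.Words)
                Console.Write(word.IsFinal ? word.Text + " " : "[" + word.Text + "] ");
        }
        await sendTask;
    }
}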