Soniox was founded in 2020 and is headquartered in Foster City, California. It has developed one of the best speech recognition engines on the market, and its cloud transcription engine is the same one audioXpress has used successfully for interviews and general speech-to-text conversion.
Focused on voice AI, Soniox introduced the world's first unsupervised learning approach for speech recognition in 2021. This innovation was key to overcoming the limitations that had previously held back the performance of speech systems.
In 2023, Soniox began transitioning from voice AI toward general-purpose AI.
We started using Soniox in 2022. At the time, a recognition result contained, besides each word's timing and recognition confidence, a Speaker field that marks who is talking in multi-speaker audio.
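To make that concrete, here is a minimal sketch of that result shape in C#. All values are invented for illustration; the field names come straight from the Result and Word messages in the proto later in this post:

using Soniox.Proto.SpeechService;

// Invented sample; Words carries per-word timing, confidence, and speaker.
var sample = new Result
{
    Words =
    {
        new Word { Text = "hello", StartMs = 120, DurationMs = 300, IsFinal = true, Speaker = 1, Confidence = 0.97 },
        new Word { Text = "world", StartMs = 460, DurationMs = 280, IsFinal = true, Speaker = 2, Confidence = 0.94 },
    },
    // Filled in when speaker diarization or identification is enabled.
    Speakers = { new ResultSpeaker { Speaker = 1 }, new ResultSpeaker { Speaker = 2 } },
};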
Starting in 2023, Soniox rolled out Section results based on semantic analysis (I even supplied a lot of the training data for it), consisting mainly of a SectionId and a Title.
Using Soniox is also simple: just search for Soniox on NuGet from C#. The one drawback is that the semantic-analysis service is fairly expensive.
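Here is a minimal synchronous-transcription sketch. The client and message types are what protoc generates from the proto below (csharp_namespace "Soniox.Proto.SpeechService"); the endpoint address, audio format, and config values are assumptions you should adapt to your own account and input file:

using System;
using System.IO;
using Google.Protobuf;
using Grpc.Net.Client;
using Soniox.Proto.SpeechService;

class Program
{
    static void Main()
    {
        // Assumed endpoint; use the address from your Soniox account.
        using var channel = GrpcChannel.ForAddress("https://api.soniox.com");
        var client = new SpeechService.SpeechServiceClient(channel);

        var response = client.Transcribe(new TranscribeRequest
        {
            ApiKey = Environment.GetEnvironmentVariable("SONIOX_API_KEY"),
            Config = new TranscriptionConfig
            {
                AudioFormat = "mp3",                  // must match the file below
                EnableGlobalSpeakerDiarization = true,
                MinNumSpeakers = 1,
                MaxNumSpeakers = 4,
            },
            Audio = ByteString.CopyFrom(File.ReadAllBytes("interview.mp3")),
        });

        // Dump each word with its timing, speaker tag, and confidence.
        foreach (var word in response.Result.Words)
        {
            Console.WriteLine($"{word.StartMs,7} ms  spk {word.Speaker}  conf {word.Confidence:F2}  {word.Text}");
        }
    }
}

The complete gRPC interface these types are generated from is the following proto file: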
syntax = "proto3";
package soniox.speech_service;
option csharp_namespace = "Soniox.Proto.SpeechService";
import "google/protobuf/timestamp.proto";
service SpeechService {
  // Synchronous transcription
  rpc Transcribe(TranscribeRequest) returns (TranscribeResponse) {}
  rpc TranscribeStream(stream TranscribeStreamRequest) returns (stream TranscribeStreamResponse) {}
  rpc TranscribeMeeting(stream TranscribeMeetingRequest) returns (stream TranscribeMeetingResponse) {}
  // Asynchronous transcription
  rpc TranscribeAsync(stream TranscribeAsyncRequest) returns (TranscribeAsyncResponse) {}
  rpc GetTranscribeAsyncStatus(GetTranscribeAsyncStatusRequest) returns (GetTranscribeAsyncStatusResponse) {}
  rpc GetTranscribeAsyncResult(GetTranscribeAsyncResultRequest) returns (stream GetTranscribeAsyncResultResponse) {}
  rpc DeleteTranscribeAsyncFile(DeleteTranscribeAsyncFileRequest) returns (DeleteTranscribeAsyncFileResponse) {}
  // Speech context
  rpc CreateSpeechContext(CreateSpeechContextRequest) returns (CreateSpeechContextResponse) {}
  rpc DeleteSpeechContext(DeleteSpeechContextRequest) returns (DeleteSpeechContextResponse) {}
  rpc ListSpeechContextNames(ListSpeechContextNamesRequest) returns (ListSpeechContextNamesResponse) {}
  rpc GetSpeechContext(GetSpeechContextRequest) returns (GetSpeechContextResponse) {}
  rpc UpdateSpeechContext(UpdateSpeechContextRequest) returns (UpdateSpeechContextResponse) {}
  // Speaker AI
  rpc AddSpeaker(AddSpeakerRequest) returns (AddSpeakerResponse) {}
  rpc GetSpeaker(GetSpeakerRequest) returns (GetSpeakerResponse) {}
  rpc RemoveSpeaker(RemoveSpeakerRequest) returns (RemoveSpeakerResponse) {}
  rpc ListSpeakers(ListSpeakersRequest) returns (ListSpeakersResponse) {}
  rpc AddSpeakerAudio(AddSpeakerAudioRequest) returns (AddSpeakerAudioResponse) {}
  rpc GetSpeakerAudio(GetSpeakerAudioRequest) returns (GetSpeakerAudioResponse) {}
  rpc RemoveSpeakerAudio(RemoveSpeakerAudioRequest) returns (RemoveSpeakerAudioResponse) {}
}
// Transcribe
message TranscribeRequest {
  string api_key = 1;
  TranscriptionConfig config = 4;
  bytes audio = 3;
}
message TranscribeResponse {
  Result result = 1;
  repeated Result channel_results = 2;
}
// TranscribeStream
message TranscribeStreamRequest {
  string api_key = 1;
  TranscriptionConfig config = 4;
  bytes audio = 3;
}
message TranscribeStreamResponse {
  Result result = 1;
}
// TranscribeMeeting
message TranscribeMeetingRequest {
  string api_key = 1;
  TranscriptionConfig config = 10;
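  // Inferred from the field names (check the Soniox docs): each participant's
  // audio is sent under its own stream_id, seq_num orders the chunks within
  // a stream, and start_of_segment/end_of_segment bracket one utterance.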
  int32 seq_num = 3;
  int32 stream_id = 4;
  bool start_of_segment = 5;
  bytes audio = 6;
  bool end_of_segment = 7;
}
message TranscribeMeetingResponse {
  int32 seq_num = 1;
  int32 stream_id = 2;
  bool start_of_segment = 3;
  bool end_of_segment = 4;
  Result result = 5;
  string error = 6;
}
// TranscribeAsync
message TranscribeAsyncRequest {
  string api_key = 1;
  string reference_name = 3;
  TranscriptionConfig config = 5;
  bytes audio = 4;
}
message TranscribeAsyncResponse {
  string file_id = 1;
}
// GetTranscribeAsyncStatus
message GetTranscribeAsyncStatusRequest {
  string api_key = 1;
  string file_id = 2;
}
message GetTranscribeAsyncStatusResponse {
  repeated TranscribeAsyncFileStatus files = 1;
}
message TranscribeAsyncFileStatus {
  string file_id = 1;
  string reference_name = 2;
  // One of: QUEUED, TRANSCRIBING, COMPLETED, FAILED
  string status = 3;
  // UTC timestamp
  google.protobuf.Timestamp created_time = 4;
  string error_message = 5;
  string transcribe_async_mode = 6;
}
// GetTranscribeAsyncResult
message GetTranscribeAsyncResultRequest {
  string api_key = 1;
  string file_id = 2;
}
message GetTranscribeAsyncResultResponse {
  bool separate_recognition_per_channel = 2;
  Result result = 1;
}
// DeleteTranscribeAsyncFile
message DeleteTranscribeAsyncFileRequest {
  string api_key = 1;
  string file_id = 2;
}
message DeleteTranscribeAsyncFileResponse {
}
// Common
message TranscriptionConfig {
  // Input options
  string audio_format = 1;
  int32 sample_rate_hertz = 2;
  int32 num_audio_channels = 3;
  // Output options
  bool include_nonfinal = 4;
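  // With include_nonfinal = true, streaming responses also include interim
  // words (is_final = false in the Word message) that may later be revised.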
  bool enable_separate_recognition_per_channel = 16;
  // Speech adaptation
  SpeechContext speech_context = 5;
  // Content moderation
  bool enable_profanity_filter = 6;
  repeated string content_moderation_phrases = 7;
  // Speaker diarization
  bool enable_streaming_speaker_diarization = 8;
  bool enable_global_speaker_diarization = 9;
  int32 min_num_speakers = 10;
  int32 max_num_speakers = 11;
  // Speaker identification
  bool enable_speaker_identification = 12;
  repeated string cand_speaker_names = 13;
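  // Candidate speaker names; these refer to speakers registered beforehand
  // via the Speaker AI RPCs (AddSpeaker / AddSpeakerAudio) further below.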
  // Model options
  string model = 14;
  bool enable_dictation = 15;
  // Asynchronous transcription
  string transcribe_async_mode = 17;
}
message Result {
  repeated Word words = 1;
  int32 final_proc_time_ms = 2;
  int32 total_proc_time_ms = 3;
  repeated ResultSpeaker speakers = 6;
  int32 channel = 7;
}
message Word {
  string text = 1;
  int32 start_ms = 2;
  int32 duration_ms = 3;
  bool is_final = 4;
  int32 speaker = 5;
  string orig_text = 8;
  double confidence = 9;
}
message ResultSpeaker {
  int32 speaker = 1;
  string name = 2;
}
// SpeechContext
message SpeechContext {
  repeated SpeechContextEntry entries = 1;
  string name = 2;
}
message SpeechContextEntry {
  repeated string phrases = 1;
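  // A positive boost biases recognition toward the phrases above; see the
  // Soniox docs for the supported value range.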
  double boost = 2;
}
message CreateSpeechContextRequest {
  string api_key = 1;
  SpeechContext speech_context = 2;
}
message CreateSpeechContextResponse {
}
message DeleteSpeechContextRequest {
  string api_key = 1;
  string name = 2;
}
message DeleteSpeechContextResponse {
}
message ListSpeechContextNamesRequest {
  string api_key = 1;
}
message ListSpeechContextNamesResponse {
  repeated string names = 1;
}
message GetSpeechContextRequest {
  string api_key = 1;
  string name = 2;
}
message GetSpeechContextResponse {
  SpeechContext speech_context = 1;
}
message UpdateSpeechContextRequest {
  string api_key = 1;
  SpeechContext speech_context = 2;
}
message UpdateSpeechContextResponse {
}
// Speaker AI
// AddSpeaker
message AddSpeakerRequest {
  string api_key = 1;
  string name = 2;
}
message AddSpeakerResponse {
  string name = 1;
  google.protobuf.Timestamp created = 2;
}
// GetSpeaker
message GetSpeakerRequest {
  string api_key = 1;
  string name = 2;
}
message GetSpeakerResponse {
  string name = 1;
  google.protobuf.Timestamp created = 2;
  repeated GetSpeakerResponseAudio audios = 3;
}
message GetSpeakerResponseAudio {
  string audio_name = 1;
  google.protobuf.Timestamp created = 2;
  int32 duration_ms = 3;
}
// RemoveSpeaker
message RemoveSpeakerRequest {
  string api_key = 1;
  string name = 2;
}
message RemoveSpeakerResponse {}
// ListSpeakers
message ListSpeakersRequest {
  string api_key = 1;
}
message ListSpeakersResponse {
  repeated ListSpeakersResponseSpeaker speakers = 1;
}
message ListSpeakersResponseSpeaker {
  string name = 1;
  google.protobuf.Timestamp created = 2;
  int32 num_audios = 3;
}
// AddSpeakerAudio
message AddSpeakerAudioRequest {
  string api_key = 1;
  string speaker_name = 2;
  string audio_name = 3;
  bytes audio = 4;
}
message AddSpeakerAudioResponse {
  string speaker_name = 1;
  string audio_name = 2;
  google.protobuf.Timestamp created = 3;
  int32 duration_ms = 4;
}
// GetSpeakerAudio
message GetSpeakerAudioRequest {
  string api_key = 1;
  string speaker_name = 2;
  string audio_name = 3;
}
message GetSpeakerAudioResponse {
  string speaker_name = 1;
  string audio_name = 2;
  google.protobuf.Timestamp created = 3;
  int32 duration_ms = 4;
  bytes audio = 5;
}
// RemoveSpeakerAudio
message RemoveSpeakerAudioRequest {
  string api_key = 1;
  string speaker_name = 2;
  string audio_name = 3;
}
message RemoveSpeakerAudioResponse {}
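For live audio, the bidirectional TranscribeStream RPC is the better fit. Below is a hedged sketch: it assumes the common speech-API convention that the first message carries the API key and config while the following messages carry raw audio chunks (the proto allows every field on every message, so check the Soniox docs for the exact contract), and the endpoint and audio-format strings are assumptions as before:

using System;
using System.IO;
using System.Threading.Tasks;
using Google.Protobuf;
using Grpc.Core;
using Grpc.Net.Client;
using Soniox.Proto.SpeechService;

class StreamingDemo
{
    static async Task Main()
    {
        using var channel = GrpcChannel.ForAddress("https://api.soniox.com"); // assumed endpoint
        var client = new SpeechService.SpeechServiceClient(channel);
        using var call = client.TranscribeStream();

        var sendTask = Task.Run(async () =>
        {
            // First message: credentials and config only (assumed convention).
            await call.RequestStream.WriteAsync(new TranscribeStreamRequest
            {
                ApiKey = Environment.GetEnvironmentVariable("SONIOX_API_KEY"),
                Config = new TranscriptionConfig
                {
                    AudioFormat = "pcm_s16le",   // assumed name for raw 16-bit PCM
                    SampleRateHertz = 16000,
                    NumAudioChannels = 1,
                    IncludeNonfinal = true,
                },
            });

            // Then stream the audio in fixed-size chunks.
            using var file = File.OpenRead("meeting.raw");
            var buffer = new byte[8192];
            int read;
            while ((read = await file.ReadAsync(buffer, 0, buffer.Length)) > 0)
            {
                await call.RequestStream.WriteAsync(new TranscribeStreamRequest
                {
                    Audio = ByteString.CopyFrom(buffer, 0, read),
                });
            }
            await call.RequestStream.CompleteAsync();
        });

        // Finals print as-is; interim (nonfinal) words are bracketed because
        // the engine may still revise them.
        await foreach (var response in call.ResponseStream.ReadAllAsync())
        {
            foreach (var word in response.Result.Words)
                Console.Write(word.IsFinal ? word.Text + " " : "[" + word.Text + "] ");
        }
        await sendTask;
    }
}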