音声ファイルをテキスト化する Microsoft Bing Speech to Text の試用をしてみました。STT にはクライアントライブラリとREST API が提供されています。お手軽な REST API の方を試してみます。シェルスクリプトのサンプルを元にPHPで書いてみました。
<?php
//
// Microsoft Bing API Speech to Text
//
// Command-line sample: obtains an access token with the subscription key,
// posts an audio file to the speech recognition REST endpoint, and dumps the
// raw response body.
//
// Usage: php sample.php <audio-file>
//

define("BING_TOKEN_URL", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken");
define("BING_BASE_URL", "https://speech.platform.bing.com/speech/recognition");
define("BING_SERVICE", "/cognitiveservices");
define("BING_VERSION", "/v1");
define("BING_SUBSCRIPTION_KEY", "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
define("BING_LANG", "ja-JP");
define("BING_LOCALE", "ja-JP");
define("AUDIO_TYPE", "audio/wav");

/*
 * Recognition modes:
 *   interactive:  a user makes short requests and expects the application
 *                 to perform an action in response.
 *   conversation: users are engaged in a human-to-human conversation.
 *   dictation:    users recite longer utterances to the application
 *                 for further processing.
 */
define("RECOGNITION_MODE", "conversation");

/*
 * Output format:
 *   simple:   A simplified phrase result containing the recognition status
 *             and the recognized text in display form.
 *   detailed: A recognition status and N-best list of phrase results
 *             where each phrase result contains all four recognition forms
 *             and a confidence score.
 */
define("OUTPUT_FORMAT", "simple");

/**
 * Sends an HTTP POST with cURL and returns the response body and status code.
 *
 * Shared by Get_Token() and Speech_to_Text() so both requests use the same
 * transport settings. TLS certificate and host verification are enabled
 * because the requests carry the subscription key and the bearer token.
 *
 * @param string      $url     Request URL.
 * @param array       $headers Raw HTTP header lines.
 * @param string|null $body    Request body, or null to send no body.
 *
 * @return array Two elements: the response body (false on transport failure)
 *               and the HTTP status code (0 on transport failure).
 */
function Bing_Http_Post($url, $headers, $body = null)
{
    $ch = curl_init();
    if ($ch === false) {
        error_log("Failed to initialize cURL.");
        return array(false, 0);
    }

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_POST, true);
    curl_setopt($ch, CURLOPT_HEADER, false);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
    if ($body !== null) {
        curl_setopt($ch, CURLOPT_POSTFIELDS, $body);
    }

    $response = curl_exec($ch);
    if ($response === false) {
        // Report the transport error instead of silently swallowing it.
        error_log("cURL request to {$url} failed: " . curl_error($ch));
        return array(false, 0);
    }

    return array($response, (int) curl_getinfo($ch, CURLINFO_HTTP_CODE));
}

/**
 * Requests an access token from the token endpoint.
 *
 * @param string $url             Token endpoint URL.
 * @param string $subscriptionKey Subscription key sent in the
 *                                Ocp-Apim-Subscription-Key header.
 *
 * @return string|false The response body (the token) when the endpoint
 *                      answers with HTTP 200; false on a transport failure or
 *                      any other status, so an error body is never mistaken
 *                      for a token.
 */
function Get_Token($url, $subscriptionKey)
{
    list($token, $status) = Bing_Http_Post($url, array(
        "Content-type: application/x-www-form-urlencoded",
        "Content-Length: 0",
        "Ocp-Apim-Subscription-Key: {$subscriptionKey}"
    ));

    if ($token === false || $status !== 200) {
        return false;
    }

    return $token;
}

/**
 * Posts an audio file to the recognition endpoint.
 *
 * @param string $url       Recognition endpoint URL including query string.
 * @param string $token     Access token sent as a bearer token.
 * @param string $audioFile Path of the audio file to upload.
 * @param string $audioType MIME type placed in the Content-Type header.
 *
 * @return string|false The raw response body (returned whatever the HTTP
 *                      status is, so error bodies stay visible to the caller);
 *                      false if the file cannot be read or the request fails
 *                      at the transport level.
 */
function Speech_to_Text($url, $token, $audioFile, $audioType)
{
    if (!is_file($audioFile) || !is_readable($audioFile)) {
        return false;
    }

    $data = file_get_contents($audioFile);
    if ($data === false) {
        return false;
    }

    // The body is passed via CURLOPT_POSTFIELDS; the explicit
    // Transfer-Encoding header asks cURL to send it chunked.
    list($res, ) = Bing_Http_Post($url, array(
        "Transfer-Encoding: chunked",
        "Content-Type: {$audioType}; codec=\"audio/pcm\"; samplerate=16000",
        "Authorization: Bearer {$token}"
    ), $data);

    return $res;
}

// Main

// Compare against the empty string instead of empty() so that a file that is
// literally named "0" is still accepted.
if (!isset($argv[1]) || trim($argv[1]) === '') {
    echo "Please, specify a file to transcribe.".PHP_EOL;
    exit(1);
}

$filename = trim($argv[1]);

// is_file() rather than file_exists(): a directory cannot be transcribed.
if (!is_file($filename)) {
    echo "The file specified doesn't exist.".PHP_EOL;
    exit(1);
}

$token = Get_Token(BING_TOKEN_URL, BING_SUBSCRIPTION_KEY);

if (empty($token)) {
    echo "Failed to get token.".PHP_EOL;
    exit(1);
}

$url = BING_BASE_URL."/".RECOGNITION_MODE.BING_SERVICE.BING_VERSION;
// Explicit "&" separator so the result does not depend on the
// arg_separator.output ini setting.
$url .= "?".http_build_query(array(
    "language"  => BING_LANG,
    "locale"    => BING_LOCALE,
    "format"    => OUTPUT_FORMAT,
    "requestid" => "rest_sample_request_id"
), '', '&');

$res = Speech_to_Text($url, $token, $filename, AUDIO_TYPE);

if ($res === false) {
    echo "Failed to transcribe the file.".PHP_EOL;
    exit(1);
}

var_dump($res);
?>
Bing Speech to Text REST API では 15秒以下という制限があるため、音声の始まりの部分しかテキスト化できませんが、ちゃんと日本語テキストとして出力してくれています。いくつか試してみました。
出力結果:
outbound_only.wav:「お電話ありがとうございます。発信専用ダイアルのためおつなぎすることが出来ません・・・」 # php sample.php outbound_only.wav string(125) "{"RecognitionStatus":"Success","DisplayText":"お電話ありがとうございます","Offset":19100000,"Duration":19100000}"
busy.wav: 「ただいま電話が大変混み合っております。・・・」 # php sample.php busy.wav string(141) "{"RecognitionStatus":"Success","DisplayText":"ただいま電話 ga 大変混み合っております","Offset":12500000,"Duration":34500000}"
outofservice.wav: 「ただいまのお時間は受付を終了しております。・・・」 # php sample.php outofservice.wav string(146) "{"RecognitionStatus":"Success","DisplayText":"ただいまのお時間は受付を終了しております","Offset":16600000,"Duration":40700000}"