Tuesday, 16 March 2021

TTS: How to convert text into SSML?

My goal is to let the device speaking a text with a human voice. So I am using the Text-to-Speech API from Google.

This is how my code looks like:

package ch.yourclick.kitt;

import android.media.MediaPlayer;
import android.os.Build;
import android.os.Bundle;
import android.os.StrictMode;
import android.view.View;

import androidx.annotation.RequiresApi;
import androidx.appcompat.app.AppCompatActivity;
import androidx.viewpager.widget.ViewPager;

import com.google.android.material.floatingactionbutton.FloatingActionButton;
import com.google.android.material.snackbar.Snackbar;
import com.google.android.material.tabs.TabLayout;
import com.google.api.gax.core.FixedCredentialsProvider;
import com.google.auth.oauth2.GoogleCredentials;
import com.google.cloud.texttospeech.v1.AudioConfig;
import com.google.cloud.texttospeech.v1.AudioEncoding;
import com.google.cloud.texttospeech.v1.SsmlVoiceGender;
import com.google.cloud.texttospeech.v1.SynthesisInput;
import com.google.cloud.texttospeech.v1.SynthesizeSpeechResponse;
import com.google.cloud.texttospeech.v1.TextToSpeechClient;
import com.google.cloud.texttospeech.v1.TextToSpeechSettings;
import com.google.cloud.texttospeech.v1.VoiceSelectionParams;
import com.google.common.html.HtmlEscapers;
import com.google.protobuf.ByteString;

import java.io.FileOutputStream;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;

import ch.yourclick.kitt.ui.main.SectionsPagerAdapter;

public class MainActivity extends AppCompatActivity implements View.OnClickListener {

    @Override
    protected void onCreate(Bundle savedInstanceState) {
        super.onCreate(savedInstanceState);
        setContentView(R.layout.activity_main);
        SectionsPagerAdapter sectionsPagerAdapter = new SectionsPagerAdapter(this, getSupportFragmentManager());
        ViewPager viewPager = findViewById(R.id.view_pager);
        viewPager.setAdapter(sectionsPagerAdapter);
        TabLayout tabs = findViewById(R.id.tabs);
        tabs.setupWithViewPager(viewPager);
        FloatingActionButton fab = findViewById(R.id.fab);

        fab.setOnClickListener(new View.OnClickListener() {
            @Override
            public void onClick(View view) {
                Snackbar.make(view, "Replace with your own action", Snackbar.LENGTH_LONG)
                        .setAction("Action", null).show();
            }
        });
    }


    @RequiresApi(api = Build.VERSION_CODES.LOLLIPOP)
    @Override
    public void onClick(View view) {
        int SDK_INT = android.os.Build.VERSION.SDK_INT;
        if (SDK_INT > 8)
        {
            StrictMode.ThreadPolicy policy = new StrictMode.ThreadPolicy.Builder()
                    .permitAll().build();
            StrictMode.setThreadPolicy(policy);

            try {
                this.hello();
            } catch (Exception e) {
                e.printStackTrace();
            }

        }
    }

    /** Demonstrates using the Text-to-Speech API. */
    @RequiresApi(api = Build.VERSION_CODES.KITKAT)
    public void hello() throws Exception {
        InputStream stream = getResources().openRawResource(R.raw.credential); // R.raw.credential is credential.json
        GoogleCredentials credentials = GoogleCredentials.fromStream(stream);
        TextToSpeechSettings textToSpeechSettings =
                TextToSpeechSettings.newBuilder()
                        .setCredentialsProvider(
                                FixedCredentialsProvider.create(credentials)
                        ).build()
                ;

        // Instantiates a client
        try (TextToSpeechClient textToSpeechClient = TextToSpeechClient.create(textToSpeechSettings)) {

            // Set the text input to be synthesized
            SynthesisInput input = SynthesisInput.newBuilder().setText("<speak>Step 1, take a deep breath. <break time=\"2000ms\"/> Hello?</speak>").build();

            // Build the voice request, select the language code ("en-US") and the ssml voice gender
            // ("neutral")
            VoiceSelectionParams voice =
                    VoiceSelectionParams.newBuilder()
                            .setLanguageCode("en-US")
                            .setSsmlGender(SsmlVoiceGender.NEUTRAL)
                            .build();

            // Select the type of audio file you want returned
            AudioConfig audioConfig =
                    AudioConfig.newBuilder().setAudioEncoding(AudioEncoding.MP3).build();

            // Perform the text-to-speech request on the text input with the selected voice parameters and
            // audio file type
            SynthesizeSpeechResponse response = textToSpeechClient.synthesizeSpeech(input, voice, audioConfig);

            // Get the audio contents from the response
            ByteString audioContents = response.getAudioContent();

            // Write the response to the output file.
            try (FileOutputStream out = new FileOutputStream(getFilesDir() + "/output.mp3")) {
                System.out.println(getFilesDir());
                out.write(audioContents.toByteArray());
                System.out.println("Audio content written to file \"output.mp3\"");
            }

            String myFile = getFilesDir() + "/output.mp3";
            MediaPlayer mediaPlayer = new MediaPlayer();
            mediaPlayer.setDataSource(myFile);
            mediaPlayer.prepare();
            mediaPlayer.start();
        }
    }

}

As you see in the code, the text should be "Step 1, take a deep breath. Step 2 ... hello? Are you there?"

Well, I get audio but it doesn't sound natural and it starts with saying "Less than speak ...", which is not the point.

It is probably not working, because I will need to convert that plaintext into SSML. But well, how can I do this?

I am using Android Studio.


Update

The following method should work fine:

public static String textToSsml(String inputFile) throws Exception {

  // Read lines of input file
  String rawLines = new String(Files.readAllBytes(Paths.get(inputFile)));

  // Replace special characters with HTML Ampersand Character Codes
  // These codes prevent the API from confusing text with SSML tags
  // For example, '<' --> '&lt;' and '&' --> '&amp;'
  String escapedLines = HtmlEscapers.htmlEscaper().escape(rawLines);

  // Convert plaintext to SSML
  // Tag SSML so that there is a 2 second pause between each address
  String expandedNewline = escapedLines.replaceAll("\\n", "\n<break time='2s'/>");
  String ssml = "<speak>" + expandedNewline + "</speak>";

  // Return the concatenated String of SSML
  return ssml;
}

Reference: https://cloud.google.com/text-to-speech/docs/ssml-tutorial?hl=en#personalizing_synthetic_audio

I have still no idea how to use this method. But this what I have tried:

package ch.yourclick.kitt;

import android.media.MediaPlayer;
import android.os.Build;
import android.os.Bundle;
import android.os.StrictMode;
import android.view.View;

import androidx.annotation.RequiresApi;
import androidx.appcompat.app.AppCompatActivity;
import androidx.viewpager.widget.ViewPager;

import com.google.android.material.floatingactionbutton.FloatingActionButton;
import com.google.android.material.snackbar.Snackbar;
import com.google.android.material.tabs.TabLayout;
import com.google.api.gax.core.FixedCredentialsProvider;
import com.google.auth.oauth2.GoogleCredentials;
import com.google.cloud.texttospeech.v1.AudioConfig;
import com.google.cloud.texttospeech.v1.AudioEncoding;
import com.google.cloud.texttospeech.v1.SsmlVoiceGender;
import com.google.cloud.texttospeech.v1.SynthesisInput;
import com.google.cloud.texttospeech.v1.SynthesizeSpeechResponse;
import com.google.cloud.texttospeech.v1.TextToSpeechClient;
import com.google.cloud.texttospeech.v1.TextToSpeechSettings;
import com.google.cloud.texttospeech.v1.VoiceSelectionParams;
import com.google.common.html.HtmlEscapers;
import com.google.protobuf.ByteString;

import java.io.FileOutputStream;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;

import ch.yourclick.kitt.ui.main.SectionsPagerAdapter;

public class MainActivity extends AppCompatActivity implements View.OnClickListener {

    @Override
    protected void onCreate(Bundle savedInstanceState) {
        super.onCreate(savedInstanceState);
        setContentView(R.layout.activity_main);
        SectionsPagerAdapter sectionsPagerAdapter = new SectionsPagerAdapter(this, getSupportFragmentManager());
        ViewPager viewPager = findViewById(R.id.view_pager);
        viewPager.setAdapter(sectionsPagerAdapter);
        TabLayout tabs = findViewById(R.id.tabs);
        tabs.setupWithViewPager(viewPager);
        FloatingActionButton fab = findViewById(R.id.fab);

        fab.setOnClickListener(new View.OnClickListener() {
            @Override
            public void onClick(View view) {
                Snackbar.make(view, "Replace with your own action", Snackbar.LENGTH_LONG)
                        .setAction("Action", null).show();
            }
        });
    }


    @RequiresApi(api = Build.VERSION_CODES.LOLLIPOP)
    @Override
    public void onClick(View view) {
        int SDK_INT = android.os.Build.VERSION.SDK_INT;
        if (SDK_INT > 8)
        {
            StrictMode.ThreadPolicy policy = new StrictMode.ThreadPolicy.Builder()
                    .permitAll().build();
            StrictMode.setThreadPolicy(policy);

            try {
                this.hello();
            } catch (Exception e) {
                e.printStackTrace();
            }

        }
    }

    /** Demonstrates using the Text-to-Speech API. */
    @RequiresApi(api = Build.VERSION_CODES.KITKAT)
    public void hello() throws Exception {
        InputStream stream = getResources().openRawResource(R.raw.credential); // R.raw.credential is credential.json
        GoogleCredentials credentials = GoogleCredentials.fromStream(stream);
        TextToSpeechSettings textToSpeechSettings =
                TextToSpeechSettings.newBuilder()
                        .setCredentialsProvider(
                                FixedCredentialsProvider.create(credentials)
                        ).build()
                ;

        // Instantiates a client
        try (TextToSpeechClient textToSpeechClient = TextToSpeechClient.create(textToSpeechSettings)) {

            // Set the text input to be synthesized
            SynthesisInput input = SynthesisInput.newBuilder().setText("Step 1 \n take a deep breath").build();

            // Build the voice request, select the language code ("en-US") and the ssml voice gender
            // ("neutral")
            VoiceSelectionParams voice =
                    VoiceSelectionParams.newBuilder()
                            .setLanguageCode("en-US")
                            .setSsmlGender(SsmlVoiceGender.NEUTRAL)
                            .build();

            // Select the type of audio file you want returned
            AudioConfig audioConfig =
                    AudioConfig.newBuilder().setAudioEncoding(AudioEncoding.MP3).build();

            // Perform the text-to-speech request on the text input with the selected voice parameters and
            // audio file type
            SynthesizeSpeechResponse response = textToSpeechClient.synthesizeSpeech(input, voice, audioConfig);

            // Get the audio contents from the response
            ByteString audioContents = response.getAudioContent();


            // Write the response to the output file.
            try (FileOutputStream out = new FileOutputStream(getFilesDir() + "/output.mp3")) {
                System.out.println(getFilesDir());
                out.write(audioContents.toByteArray());
                System.out.println("Audio content written to file \"output.mp3\"");
            }

            String myFile = getFilesDir() + "/output.mp3";
            MediaPlayer mediaPlayer = new MediaPlayer();
            mediaPlayer.setDataSource(myFile);
            mediaPlayer.prepare();
            mediaPlayer.start();

            if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) {
                textToSsml(getFilesDir() + "/output.mp3");
            }
        }
    }

    @RequiresApi(api = Build.VERSION_CODES.O)
    public static String textToSsml(String inputFile) throws Exception {

        // Read lines of input file
        String rawLines = new String(Files.readAllBytes(Paths.get(inputFile)));

        // Replace special characters with HTML Ampersand Character Codes
        // These codes prevent the API from confusing text with SSML tags
        // For example, '<' --> '&lt;' and '&' --> '&amp;'
        String escapedLines = HtmlEscapers.htmlEscaper().escape(rawLines);

        // Convert plaintext to SSML
        // Tag SSML so that there is a 2 second pause between each address
        String expandedNewline = escapedLines.replaceAll("\\n", "\n<break time='2s'/>");
        String ssml = "<speak>" + expandedNewline + "</speak>";

        // Return the concatenated String of SSML
        return ssml;
    }

}

Well, the goal is that the audio will be: "Step 1" (wait for 2 seconds) "take a deep breath" But in my case the output is "Step 1 take a deep breath", so the pause of 2 seconds is missing. What am I doing wrong?



from TTS: How to convert text into SSML?

No comments:

Post a Comment