transformers_js_py
from transformers_js_py import import_transformers_js
import gradio as gr
import numpy as np
transformers_js = await import_transformers_js("3.0.2")
pipeline = transformers_js.pipeline
synthesizer = await pipeline(
    'text-to-speech',
    'Xenova/speecht5_tts',
    { "quantized": False }
)
speaker_embeddings = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin';
async def synthesize(text):
    """Synthesize speech for *text*; return ``(sampling_rate, int16_samples)``.

    Gradio's "audio" output component accepts a ``(sample_rate, ndarray)``
    tuple, so the model's float32 waveform is converted to 16-bit PCM.
    """
    out = await synthesizer(text, {"speaker_embeddings": speaker_embeddings})
    audio_memory_view = out["audio"]
    sampling_rate = out["sampling_rate"]
    # Reinterpret the raw JS buffer as float32 samples (nominally in [-1, 1]).
    audio_data = np.frombuffer(audio_memory_view, dtype=np.float32)
    # Clip before scaling: any sample even slightly outside [-1, 1] would
    # otherwise wrap around (overflow) when cast to int16.
    audio_data_16bit = (np.clip(audio_data, -1.0, 1.0) * 32767).astype(np.int16)
    return sampling_rate, audio_data_16bit
# Minimal UI: one textbox feeding the TTS coroutine, an audio player for output.
demo = gr.Interface(fn=synthesize, inputs="textbox", outputs="audio")
demo.launch()