huge commit. assistant code is now semifunctional

This commit is contained in:
Looki2000
2023-12-01 21:48:56 +01:00
parent af6f2a13b7
commit f18e41476f
7 changed files with 501 additions and 168 deletions


@@ -1,5 +1,4 @@
 import os
-import torch
 import pyaudio
 from TTS.api import TTS
 from TTS.tts.configs.xtts_config import XttsConfig
@@ -7,27 +6,35 @@ from TTS.tts.models.xtts import Xtts
 from TTS.utils.generic_utils import get_user_data_dir
 import threading
 import time
+import re

-# Check if CUDA is available
-if torch.cuda.is_available():
-    print("Using CUDA")
-    device = "cuda"
-else:
-    print("Using CPU")
-    device = "cpu"

 model_name = "tts_models/multilingual/multi-dataset/xtts_v2"

-class TTSstream:
-    def __init__(self, speaker_wav):
+class TTSStream:
+    def __init__(self, speaker_wav=None, device=None):
         model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))

+        if device is None:
+            import torch
+
+            # Check if CUDA is available
+            if torch.cuda.is_available():
+                print("Using CUDA")
+                device = "cuda"
+            else:
+                print("Using CPU")
+                device = "cpu"
+
         #print(model_path)

         print("Loading TTS model... ", end="")

         # download model if it doesn't exist
         if not os.path.exists(os.path.join(model_path, "config.json")):
-            print("Downloading model...")
+            print("Downloading model... ", end="")
             tts = TTS()
             tts.download_model_by_name(model_name=model_name)
@@ -43,11 +50,16 @@ class TTSstream:
         )
         self.model.to(device)
         print("Done!")

-        self.gpt_cond_latent, self.speaker_embedding = self.model.get_conditioning_latents(audio_path=speaker_wav)
+        if speaker_wav is not None:
+            #self.gpt_cond_latent, self.speaker_embedding = self.model.get_conditioning_latents(audio_path=speaker_wav)
+            self.change_speaker(speaker_wav)
+
+    def change_speaker(self, speaker_wav):
+        print("Loading speaker... ", end="")
+        self.gpt_cond_latent, self.speaker_embedding = self.model.get_conditioning_latents(audio_path=speaker_wav)
+        print("Done!")

     def _write_stream(self):
         # play first play_buffer_size samples and remove them from the buffer
@@ -58,6 +70,7 @@ class TTSstream:
                 self.stream.write(self.chunk)
             else:
                 if self.all_done:
+                    #self.thread_ended = True
                     break
                 time.sleep(0.01)
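For context: `_write_stream` is the consumer half of a producer/consumer pair. `tts_speak` appends raw audio bytes to the shared `self.chunks_bin` buffer while this thread drains it in fixed-size pieces, with `self.all_done` as the end-of-stream flag. A minimal standalone sketch of that pattern follows; the names here are hypothetical, and a lock is added for safety where the committed code relies on CPython's GIL instead:

import threading
import time

class ChunkPlayer:
    # Illustrative sketch, not part of the commit.
    def __init__(self, chunk_size=2048):
        self.chunk_size = chunk_size   # bytes per write, cf. play_buffer_size
        self.buffer = b""
        self.all_done = False
        self.lock = threading.Lock()   # the committed code relies on the GIL instead

    def _consume(self, sink):
        # mirror of _write_stream: drain fixed-size chunks until the producer
        # signals completion; a trailing partial chunk is dropped, as in the original
        while True:
            chunk = None
            with self.lock:
                if len(self.buffer) >= self.chunk_size:
                    chunk = self.buffer[:self.chunk_size]
                    self.buffer = self.buffer[self.chunk_size:]
            if chunk is not None:
                sink(chunk)            # stream.write(chunk) in the real code
            elif self.all_done:
                break
            else:
                time.sleep(0.01)       # buffer underrun: wait for the producer

    def play(self, chunks, sink):
        # mirror of tts_speak's inner loop: consume in a background thread
        # while appending freshly generated audio bytes to the shared buffer
        thread = threading.Thread(target=self._consume, args=(sink,))
        thread.start()
        for chunk in chunks:           # e.g. bytes from model.inference_stream(...)
            with self.lock:
                self.buffer += chunk
        self.all_done = True           # let the consumer drain and exit
        thread.join()

For example, ChunkPlayer(chunk_size=4).play([b"abcdefgh", b"ij"], sink=print) plays b"abcd" and b"efgh" and, like the original, silently drops the 2-byte tail.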
@@ -65,42 +78,60 @@
     def tts_speak(self, text):
         self.play_buffer_size = 512

-        chunks = self.model.inference_stream(
-            text,
-            "pl",
-            self.gpt_cond_latent,
-            self.speaker_embedding,
-            stream_chunk_size=20,
-        )
-
         # open pyaudio stream
         p = pyaudio.PyAudio()
         self.stream = p.open(format=pyaudio.paFloat32, channels=1, rate=24000, output=True)

-        self.chunks_bin = b""
-        self.all_done = False
-
-        # run write_stream as thread
-        thread = threading.Thread(target=self._write_stream)
-        thread.start()
-
-        while True:
-            try:
-                # read chunks from chunks generator as they are generated
-                for self.chunk in chunks:
-                    self.chunks_bin += self.chunk.cpu().numpy().astype("float32").tobytes()
-                break
-            # some weird error caused by coqui-tts
-            except:
-                print("Error occurred when generating audio stream. Retrying...")
-
-        self.all_done = True
-
-        # wait for thread to finish
-        thread.join()
+        # for each sentence ending with . or ! or ?
+        for text in re.split(r"(?<=[.!?])", text):
+            text = text.strip()
+
+            if len(text) == 0:
+                continue
+
+            chunks = self.model.inference_stream(
+                text,
+                "pl",
+                self.gpt_cond_latent,
+                self.speaker_embedding,
+                stream_chunk_size=20,
+            )
+
+            self.chunks_bin = b""
+            self.all_done = False
+
+            # run write_stream as thread
+            #self.thread_ended = False
+            thread = threading.Thread(target=self._write_stream)
+            thread.start()
+
+            while True:
+                try:
+                    # read chunks from chunks generator as they are generated
+                    for self.chunk in chunks:
+                        self.chunks_bin += self.chunk.cpu().numpy().astype("float32").tobytes()
+                    break
+                # some weird error caused by coqui-tts
+                except:
+                    print("Error occurred when generating audio stream. Retrying...")
+                    continue
+
+            self.all_done = True
+
+            # wait for thread to finish
+            thread.join()
+
+            # wait for thread ended
+            #while not self.thread_ended:
+            #    time.sleep(0.01)
+            #while True:
+            #    if self.thread_ended:
+            #        break
+            #    print("Waiting for thread to end...")
+            #    time.sleep(0.01)

         self.stream.close()
         p.terminate()
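Taken together: `tts_speak` now splits the input with `re.split(r"(?<=[.!?])", text)`, which keeps each terminator attached to its sentence and yields an empty piece when the text ends in a terminator (hence the `len(text) == 0` guard), then synthesizes and plays each sentence as its chunks arrive. A minimal usage sketch under assumed names; the module name and WAV paths are hypothetical, and note the language code is hard-wired to "pl" (Polish):

from tts_stream import TTSStream   # module name assumed; the filename is not shown in the diff

# device is auto-detected (CUDA if available) when not passed explicitly
tts = TTSStream(speaker_wav="voices/speaker.wav")

# each sentence is synthesized and streamed to the sound card in turn
tts.tts_speak("Cześć! Jak się masz? To jest test strumieniowania.")

# swap the reference voice without reloading the model
tts.change_speaker("voices/other_speaker.wav")
tts.tts_speak("Teraz mówię innym głosem.")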