AAAAAA

Added UDP raw audio stream player script
removed unused and unnecessary functionality
2023-12-06 01:19:49 +01:00 · 2023-12-05 01:40:59 +01:00 · 2023-12-04 23:36:07 +01:00 · 2023-12-04 23:07:43 +01:00 · 2023-12-04 14:35:59 +01:00 · 2023-12-04 14:35:35 +01:00
13 changed files with 965 additions and 183 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,9 +1,10 @@
 finetune_dialogs_tool/temp/
 finetune_dialogs_tool/output_dialogs/
-finetune_dialogs_tool/__pycache__/

 frontend/voices/*
 !frontend/voices/lector.wav
 !frontend/voices/lector source.txt

-frontend/__pycache__/
+frontend/dupa.py
+
+*/__pycache__/
--- a/frontend/app.py
+++ b/frontend/app.py
@@ -1,21 +1,80 @@
 import gradio as gr
+from core import Core
 import os

+def start_assistant(phone_number, order_items, delivery_address, payment_method):
+    global assistant
+    assistant.set_order_settings(
+        phone_number, 
+        order_items, 
+        delivery_address,
+        payment_method
+    )
+    assistant.assistant_start()

-def proceed():
-    print("ok")
+
+def stop_assistant():
+    assistant.assistant_stop()
+
+
+def set_advanced_settings(speech_recog_timeout, window_size_sec, vad_threshold, min_silence_duration_ms, speech_pad_ms):
+    assistant.set_speech_recog_settings(
+        speech_recog_timeout, 
+        window_size_sec, 
+        vad_threshold, 
+        min_silence_duration_ms, 
+        speech_pad_ms
+    )
+
+def set_voice_wav(speaker_wav):
+    speaker_wav = os.path.join(cwd, "voices", speaker_wav)
+    assistant.set_tts_settings(speaker_wav)
+
+
+assistant = Core(
+    use_chatgpt_placeholder = True
+)
+
+cwd = os.path.dirname(os.path.realpath(__file__))


 with gr.Blocks() as demo:
-    txt_2 = gr.Textbox(label="Podaj Twój nr telefonu", lines=1)
-    txt_2 = gr.Textbox(label="Co chcesz zamówić?", lines=2)
-    txt_3 = gr.Textbox(label="Na jaki adres?")
-    txt_4 = gr.Textbox(label="Dodatkowe informacje", lines=2)
-    gr.Dropdown(["Wejściowe", "Wyjściowe"], label="Urządzenie", info="Wybierz urządzenie audio!"),
-    gr.Radio(["inteigentna osoba", "50/50", "głupek"], label="Jaki rodzaj osoby udawać"),
-    gr.Radio(["hitler", "stuu", "lektor","belmondawg","sasza", "villager"], label="Głos", info="Jakiego głosu użyć?"),
-    btn = gr.Button(value="Submit")
-    btn.click(proceed)
+    with gr.Tab("Basic Settings"):
+        with gr.Row():
+            phone_number = gr.Textbox(label="Twój Nr. Telefonu")
+            order_items = gr.Textbox(label="Zamówienie", lines=5)
+            delivery_address = gr.Textbox(label="Adres dostawy")
+            payment_method = gr.Dropdown(label="Metoda płatności", choices=["Gotówka", "Karta"])
+            with gr.Column():
+                speaker_wav = gr.Textbox(label="Wav głosu", value="lector.wav")
+                set_voice = gr.Button("Ustaw głos")

-if __name__ == "__main__":
-    demo.launch()
+                # init settings
+                #assistant.set_tts_settings(speaker_wav.value)
+                set_voice_wav(speaker_wav.value)
+
+                #set_voice.click(assistant.set_tts_settings, inputs=[speaker_wav], outputs=[])
+                set_voice.click(set_voice_wav, inputs=[speaker_wav], outputs=[])
+        with gr.Row():
+            start_btn = gr.Button("Start Pizzobota")
+            stop_btn = gr.Button("Stop Pizzobota")
+
+        
+
+        start_btn.click(start_assistant, inputs=[phone_number, order_items, delivery_address, payment_method], outputs=[])
+        stop_btn.click(stop_assistant, inputs=[])
+
+    with gr.Tab("Advanced Settings"):
+        speech_recog_timeout = gr.Number(label="Speech Recog Timeout (sec)", value=1)
+        window_size_sec = gr.Number(label="Window Size (sec)", value=0.1)
+        vad_threshold = gr.Number(label="VAD Threshold", value=0.65)
+        min_silence_duration_ms = gr.Number(label="Min Silence Duration (ms)", value=250)
+        speech_pad_ms = gr.Number(label="Speech Pad (ms)", value=0)
+
+        # init settings
+        set_advanced_settings(speech_recog_timeout.value, window_size_sec.value, vad_threshold.value, min_silence_duration_ms.value, speech_pad_ms.value)
+
+        set_adv_btn = gr.Button("Ustaw")
+        set_adv_btn.click(set_advanced_settings, inputs=[speech_recog_timeout, window_size_sec, vad_threshold, min_silence_duration_ms, speech_pad_ms], outputs=[])
+
+demo.launch()
--- a/frontend/chatgpt_wrap.py
+++ b/frontend/chatgpt_wrap.py
@@ -0,0 +1,101 @@
+import random
+from dupa import dupa
+import openai
+
+
+class ChatGPTWrap:
+    def __init__(self, use_chatgpt_placeholder = False):
+        self.use_chatgpt_placeholder = use_chatgpt_placeholder
+
+        # true chatgpt
+        if not use_chatgpt_placeholder:
+
+            print("Initializing ChatGPT... ", end="")
+            with open("system_instructions.txt", "r", encoding="utf-8") as f:
+                self.system_inst_template = f.read()
+            
+
+
+            #### true openai chat gpt initialization stuff below (everything that needs to be done only once) ####
+            #raise NotImplementedError("True ChatGPT is not implemented yet!")
+            self.client = openai.OpenAI(api_key=dupa)
+
+
+
+
+            print("Done!")
+
+        # placeholder chatgpt
+        else:
+            print("Using ChatGPT placeholder!")
+            self.message_idx = 0
+
+
+
+    def init_order(self, phone_number, order_items, delivery_address, payment_method):
+        self.phone_number = phone_number
+        self.order_items = order_items
+        self.delivery_address = delivery_address
+        self.payment_method = payment_method
+
+        # true chatgpt
+        if not self.use_chatgpt_placeholder:
+            # generate system instructions from template
+            self.system_inst = self.system_inst_template.format(
+                phone_number = self.phone_number,
+                order_items = self.order_items,
+                delivery_location = self.delivery_address,
+                payment_method = self.payment_method
+            )
+
+            print("System:")
+            print(self.system_inst)
+
+            #### true openai chat gpt system instructions initialization stuff below ####
+            ##### (everything that needs to be done each assistant session like some chat gpt conversation cleanup) ####
+
+            self.chat_history = [{
+                "role": "system",
+                "content": self.system_inst
+            }]
+
+
+    def get_response(self, input_message):
+        # true chatgpt
+        if not self.use_chatgpt_placeholder:
+            #### true openai chat gpt response stuff below ####
+
+            self.chat_history.append( 
+                {"role": "user", "content": input_message}, 
+            )
+
+            chat = self.client.chat.completions.create(
+                model="ft:gpt-3.5-turbo-1106:personal::8QBgBttE",
+                messages=self.chat_history,
+                temperature=0.57,
+                max_tokens=256,
+                top_p=1,
+                frequency_penalty=0,
+                presence_penalty=0
+            )
+
+            reply = chat.choices[0].message.content 
+
+            self.chat_history.append(
+                {"role": "assistant", "content": reply}
+            )
+            
+            return reply
+
+        # placeholder chatgpt
+        else:
+            choices = (
+                self.phone_number,
+                self.order_items,
+                self.delivery_address,
+                self.payment_method
+            )
+
+            self.message_idx += 1
+
+            return f"czat dżi pi ti plejsholder {random.choice(choices)}{' CALLEND' if self.message_idx == 3 else ''}"
--- a/frontend/chatgpt_wrap_test.ipynb
+++ b/frontend/chatgpt_wrap_test.ipynb
@@ -0,0 +1,134 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from chatgpt_wrap import ChatGPTWrap"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Initializing ChatGPT... Done!\n"
+     ]
+    }
+   ],
+   "source": [
+    "chatgpt = ChatGPTWrap()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "System:\n",
+      "You are a bot that will act as a guy that orders a pizza you will be connected on the phone with a pizza place. you will be asked questions about example: Where should be the pizza delivered etc. and you will respond with the data in the data section also you HAVE to respond in a full sentence because it will be transformed into audio using a tts software so you cant use a list just make a sentence like: i would like a margharitta and a cocacola please. basically just write it like you would say it dont put numbers but put words that are numbers dont add shortcuts add the full word\n",
+      "\n",
+      "DATA:\n",
+      "phone: 123456789\n",
+      "delivery location: ul. Amogusowa 1337, Suski Małe\n",
+      "paymentMethod: karta\n",
+      "OrderItems:\n",
+      "1x margharitta\n",
+      "2x sos majonezowy\n",
+      "\n",
+      "REMEMBER DONT USE NUMBERS, USE WORDS for example dont say 1x, say one time also REMEMBER to use replacement words to a word so it is appropriate to the whole sentence example:\n",
+      "WRONG:\n",
+      "Chciałbym zamówić jedną Margherittę, dwie Colę, pięć Fant i jedną Sprite.\n",
+      "RIGHT:\n",
+      "Chciałbym zamówić jedną Margarittę, dwie Kole, pięć Fant i jednego Sprajta.\n",
+      "\n",
+      "If the call ends, say at the end of your final response \"CALLEND\"\n"
+     ]
+    }
+   ],
+   "source": [
+    "chatgpt.init_order(\n",
+    "    phone_number = \"123456789\",\n",
+    "    order_items = \"1x margharitta\\n2x sos majonezowy\",\n",
+    "    delivery_address = \"ul. Amogusowa 1337, Suski Małe\",\n",
+    "    payment_method = \"karta\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "You >Witamy tu pizzer amogus\n",
+      "ChatGPT >Dzień dobry, chciałem złożyć zamówienie z dostawą do domu\n",
+      "You >dobrze. jakie to bedzie zamówienie?\n",
+      "ChatGPT >poproszę jedną margaritte i dwa sosy majonezowe\n",
+      "You >dobrze czy to wszystko?\n",
+      "ChatGPT >tak\n",
+      "You >jaki adres i płatność?\n",
+      "ChatGPT >adres to ulica amogusowa 1337 w suskach małych, a płatność będzie kartą\n",
+      "You >dobrze, pizza powinna byc gotowa za pół godziny\n",
+      "ChatGPT >super, dziękuje i do widzenia CALLEND\n",
+      "Conversation ended.\n"
+     ]
+    }
+   ],
+   "source": [
+    "try:\n",
+    "    while True:\n",
+    "        message = input(\"You >\")\n",
+    "        print(\"You >\" + message)\n",
+    "\n",
+    "        if message == \"\":\n",
+    "            break\n",
+    "\n",
+    "        response = chatgpt.get_response(message)\n",
+    "\n",
+    "        print(\"ChatGPT >\" + response)\n",
+    "\n",
+    "        if \"CALLEND\" in response:\n",
+    "            print(\"Conversation ended.\")\n",
+    "            break\n",
+    "\n",
+    "except (KeyboardInterrupt, SystemExit):\n",
+    "    exit()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/frontend/core.py
+++ b/frontend/core.py
@@ -0,0 +1,191 @@
+from vad_recorder import VADRecorder
+from tts_stream import TTSStream
+from chatgpt_wrap import ChatGPTWrap
+from faster_whisper import WhisperModel
+import torch
+import time
+import numpy as np
+import re
+
+
+
+class Core:
+    def __init__(self, whisper_model_name = "large-v3", use_chatgpt_placeholder = False):
+        self.use_chatgpt_placeholder = use_chatgpt_placeholder
+
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        print("\n=======================================")
+        print(f"Using {self.device.capitalize()} for:")
+        print(" - Faster Whisper")
+        print(" - TTS")
+        print("=======================================\n")
+
+        print("Loading Faster Whisper model... ", end="")
+        self.whisper_model = WhisperModel(whisper_model_name, device=self.device, compute_type="float16")
+        print("Done!")
+
+        # VADRecorder, TTSStream and ChatGPTWrap have their own console loading messages
+        self.vad_rec = VADRecorder()
+        self.tts = TTSStream(device=self.device)
+        self.gpt_wrap = ChatGPTWrap(use_chatgpt_placeholder)
+
+
+    def set_order_settings(self, phone_number, order_items, delivery_address, payment_method):
+        self.phone_number = phone_number
+        self.order_items = order_items
+        self.delivery_address = delivery_address
+        self.payment_method = payment_method
+
+
+    #def set_speech_recog_settings(self, speech_recog_timeout, audio_input_device_name, audio_output_device_name, window_size_sec, vad_threshold, min_silence_duration_ms, speech_pad_ms):
+    def set_speech_recog_settings(self, speech_recog_timeout, window_size_sec, vad_threshold, min_silence_duration_ms, speech_pad_ms):
+        self.speech_recog_timeout = speech_recog_timeout
+        #self.audio_input_device_name = audio_input_device_name
+        #self.audio_output_device_name = audio_output_device_name
+        self.window_size_sec = window_size_sec
+        self.vad_threshold = vad_threshold
+        self.min_silence_duration_ms = min_silence_duration_ms
+        self.speech_pad_ms = speech_pad_ms
+
+
+    def set_tts_settings(self, speaker_wav):
+        #self.speaker_wav = speaker_wav
+        print("Setting TTS speaker... ", end="")
+        self.tts.change_speaker(speaker_wav)
+        print("Done!")
+
+
+    def assistant_start(self):
+        print("Starting assistant...")
+
+        #print("Setting TTS speaker... ", end="")
+        #self.tts.change_speaker(self.speaker_wav)
+        #print("Done!")
+
+        print("Starting VAD recording thread... ", end="")
+        self.vad_rec.start_vad_recorder(
+            #target_device_name = self.audio_input_device_name,
+            window_size_sec = self.window_size_sec,
+            vad_threshold = self.vad_threshold,
+            min_silence_duration_ms = self.min_silence_duration_ms,
+            speech_pad_ms = self.speech_pad_ms
+        )
+        print("Done!")
+
+        self.gpt_wrap.init_order(
+            self.phone_number, 
+            self.order_items, 
+            self.delivery_address, 
+            self.payment_method
+        )
+
+
+
+        print("LISTENING!!!")
+
+
+        last_recog_time = time.perf_counter()
+        speech_recog_text = ""
+
+        self.assistant_running = True
+
+        while True:
+            if self.vad_rec.speech:
+                last_recog_time = time.perf_counter()
+
+            if len(self.vad_rec.audios_for_whisper) > 0:
+                #stream_out.write(audios_for_whisper.pop(0))
+                audio = np.array(self.vad_rec.audios_for_whisper.pop(0), dtype=np.float32)
+
+                segments, _ = self.whisper_model.transcribe(audio, language="pl")
+                if not self.assistant_running:
+                    break
+
+                text = "".join([segment.text for segment in segments])
+                #speech_recog_text += " " if len(speech_recog_text) else "" + text
+
+                if len(text) == 0:
+                    continue
+
+                if not text[-1] in ".,!?":
+                    text += "."
+
+                speech_recog_text += text.strip() + "\n"
+
+                print("=========================================")
+                print(text)
+
+                last_recog_time = time.perf_counter()
+
+
+
+            elif time.perf_counter() - last_recog_time > self.speech_recog_timeout and len(speech_recog_text) > 0:
+                speech_recog_text = speech_recog_text.strip()
+
+                print("=========================================\n\n")
+                print("-----------------------------------------")
+                print("!!!!!!!!!! SENDING TO CHATGPT !!!!!!!!!!!")
+                print("-----------------------------------------")
+                print(speech_recog_text)
+                print("-----------------------------------------\n\n")
+
+                
+
+                gpt_response = self.gpt_wrap.get_response(speech_recog_text)
+
+                # separate long sequences of numbers in text string (for example 123456789) into packets of 3 (123 456 789)
+                gpt_response = re.sub(r"(\d{3})(?=\d)", r"\1 ", gpt_response)
+
+                # Add space on the right side of numbers
+                gpt_response = re.sub(r'(\d)([^\d\s])', r'\1 \2', gpt_response)
+                # Add space on the left side of numbers
+                gpt_response = re.sub(r'([^\d\s])(\d)', r'\1 \2', gpt_response)
+
+                # replace "ul." with "ulica" (non case sensitive)
+                gpt_response = re.sub(r"ul\.", "ulica", gpt_response, flags=re.IGNORECASE)
+
+
+                print("-----------------------------------------")
+                if self.use_chatgpt_placeholder:
+                    print("!!!!! CHATGPT PLACEHOLDER RESPONSE !!!!!!")
+                else:
+                    print("!!!!!!!!!!! CHATGPT RESPONSE !!!!!!!!!!!!")
+                print("-----------------------------------------")
+                print(gpt_response)
+                print("-----------------------------------------\n\n")
+
+                if not self.assistant_running:
+                    break
+
+                speech_recog_text = ""
+
+
+                # tts
+                print("Speech synthesis stream started!")
+                self.tts.tts_speak(gpt_response.replace(" CALLEND", ""))
+
+
+                if "CALLEND" in gpt_response:
+                    self.assistant_stop()
+
+
+            #print(len(audios_for_whisper), time.perf_counter() - last_recog_time, len(speech_recog_text))
+
+
+            time.sleep(0.01)
+
+            if not self.assistant_running:
+                break
+
+        # NOTE: assistant_running stays False after the loop exits; assistant_start() sets it to True again on the next run
+
+    def assistant_stop(self):
+        print("Stopping assistant... ", end="")
+
+        self.assistant_running = False
+
+        self.vad_rec.stop_vad_recorder()
+        
+        print("Done!")
+
+
--- a/frontend/core_test.ipynb
+++ b/frontend/core_test.ipynb
@@ -0,0 +1,86 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from core import Core\n",
+    "\n",
+    "core = Core(\n",
+    "    whisper_model_name = \"large-v3\",\n",
+    "    use_chatgpt_placeholder = False,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "core.set_order_settings(\n",
+    "    phone_number = \"123456789\",\n",
+    "    order_items = \"1x pizza margheritta 42 centrymetrów\\n1x sos pomidorowy\\n1x sos czosnkowy\",\n",
+    "    delivery_address = \"ul. Amogusowa 16A\",\n",
+    "    payment_method = \"gotówka\"\n",
+    ")\n",
+    "\n",
+    "core.set_speech_recog_settings(\n",
+    "    speech_recog_timeout = 1.0,\n",
+    "    #audio_input_device_name = \"Virtual\",\n",
+    "    #audio_output_device_name = \"placeholder\",\n",
+    "    window_size_sec = 0.1,\n",
+    "    vad_threshold = 0.65,\n",
+    "    min_silence_duration_ms = 250,\n",
+    "    speech_pad_ms = 0\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "core.set_tts_settings(\n",
+    "    speaker_wav = \"voices/lector.wav\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "try:\n",
+    "    core.assistant_start()\n",
+    "except (KeyboardInterrupt, SystemExit):\n",
+    "    core.assistant_stop()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/frontend/system_instructions.txt
+++ b/frontend/system_instructions.txt
@@ -0,0 +1,16 @@
+You are a bot that will act as a guy that orders a pizza you will be connected on the phone with a pizza place. you will be asked questions about example: Where should be the pizza delivered etc. and you will respond with the data in the data section also you HAVE to respond in a full sentence because it will be transformed into audio using a tts software so you cant use a list just make a sentence like: i would like a margharitta and a cocacola please. basically just write it like you would say it dont put numbers but put words that are numbers dont add shortcuts add the full word
+
+DATA:
+phone: {phone_number}
+delivery location: {delivery_location} (PIZZA IS FOR DELIVERY. NOT for dine-in.)
+paymentMethod: {payment_method}
+OrderItems:
+{order_items}
+
+REMEMBER DONT USE NUMBERS, USE WORDS for example dont say 1x, say one time also REMEMBER to use replacement words to a word so it is appropriate to the whole sentence example:
+WRONG:
+Chciałbym zamówić jedną Margherittę, dwie Colę, pięć Fant i jedną Sprite.
+RIGHT:
+Chciałbym zamówić jedną Margarittę, dwie Kole, pięć Fant i jednego Sprajta.
+
+If the call ends, say at the end of your final response "CALLEND"
--- a/frontend/tts
+++ b/frontend/tts
@@ -6,9 +6,9 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "from tts_stream import TTSstream\n",
+    "from tts_stream import TTSStream\n",
    "\n",
-    "tts = TTSstream(speaker_wav=\"voices/lector.wav\")"
+    "tts = TTSStream(speaker_wav=\"voices/lector.wav\")"
   ]
  },
  {
@@ -27,7 +27,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "tts.tts_speak(\"Testowanie syntezy naturalnego głosu za pomocą sztucznej sieci neuronowej.\")\n"
+    "tts.tts_speak(\"Testowanie syntezy naturalnego głosu za pomocą sztucznych sieci neuronowych.\")\n"
   ]
  }
 ],
--- a/frontend/tts_stream.py
+++ b/frontend/tts_stream.py
@@ -1,5 +1,4 @@
 import os
-import torch
 import pyaudio
 from TTS.api import TTS
 from TTS.tts.configs.xtts_config import XttsConfig
@@ -7,27 +6,35 @@ from TTS.tts.models.xtts import Xtts
 from TTS.utils.generic_utils import get_user_data_dir
 import threading
 import time
+import re
+

-# Check if CUDA is available
-if torch.cuda.is_available():
-    print("Using CUDA")
-    device = "cuda"
-else:
-    print("Using CPU")
-    device = "cpu"

 model_name = "tts_models/multilingual/multi-dataset/xtts_v2"


-class TTSstream:
-    def __init__(self, speaker_wav):
+class TTSStream:
+    def __init__(self, speaker_wav=None, device=None):
        model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))

+        if device is None:
+            import torch
+
+            # Check if CUDA is available
+            if torch.cuda.is_available():
+                print("Using CUDA")
+                device = "cuda"
+            else:
+                print("Using CPU")
+                device = "cpu"
+
        #print(model_path)
+
+        print("Loading TTS model... ", end="")
 #
        # download model if it doesn't exist
        if not os.path.exists(os.path.join(model_path, "config.json")):
-            print("Downloading model...")
+            print("Downloading model... ", end="")
            tts = TTS()
            tts.download_model_by_name(model_name=model_name)

@@ -43,11 +50,16 @@ class TTSstream:
        )
        self.model.to(device)

+        print("Done!")

-        self.gpt_cond_latent, self.speaker_embedding = self.model.get_conditioning_latents(audio_path=speaker_wav)
+        if speaker_wav is not None:
+            #self.gpt_cond_latent, self.speaker_embedding = self.model.get_conditioning_latents(audio_path=speaker_wav)
+            self.change_speaker(speaker_wav)

    def change_speaker(self, speaker_wav):
+        print("Loading speaker... ", end="")
        self.gpt_cond_latent, self.speaker_embedding = self.model.get_conditioning_latents(audio_path=speaker_wav)
+        print("Done!")

    def _write_stream(self):
        # play first play_buffer_size samples and remove them from the buffer
@@ -58,6 +70,7 @@ class TTSstream:
                self.stream.write(self.chunk)
            else:
                if self.all_done:
+                    #self.thread_ended = True
                    break
                time.sleep(0.01)

@@ -65,42 +78,60 @@ class TTSstream:
    def tts_speak(self, text):
        self.play_buffer_size = 512

-        chunks = self.model.inference_stream(
-            text,
-            "pl",
-            self.gpt_cond_latent,
-            self.speaker_embedding,
-            stream_chunk_size=20,
-        )
-

        # open pyaudio stream
        p = pyaudio.PyAudio()
        self.stream = p.open(format=pyaudio.paFloat32, channels=1, rate=24000, output=True)

+        # for each sentence ending with . or ! or ?
+        for text in re.split(r"(?<=[.!?])", text):
+            text = text.strip()

-        self.chunks_bin = b""
-        self.all_done = False
-
-        # run write_stream as thread
-        thread = threading.Thread(target=self._write_stream)
-        thread.start()
-
-        while True:
-            try:
-                # read chunks from chunks generator as they are generated
-                for self.chunk in chunks:
-                    self.chunks_bin += self.chunk.cpu().numpy().astype("float32").tobytes()
-                break
-            # some weird error caused by coqui-tts
-            except:
-                print("Error occured when generating audio stream. Retrying...")
+            if len(text) == 0:
                continue

-        self.all_done = True
+            chunks = self.model.inference_stream(
+                text,
+                "pl",
+                self.gpt_cond_latent,
+                self.speaker_embedding,
+                stream_chunk_size=20,
+            )

-        # wait for thread to finish
-        thread.join()
+
+            self.chunks_bin = b""
+            self.all_done = False
+
+            # run write_stream as thread
+            #self.thread_ended = False
+            thread = threading.Thread(target=self._write_stream)
+            thread.start()
+
+            while True:
+                try:
+                    # read chunks from chunks generator as they are generated
+                    for self.chunk in chunks:
+                        self.chunks_bin += self.chunk.cpu().numpy().astype("float32").tobytes()
+                    break
+                # some weird error caused by coqui-tts
+                except:
+                    print("Error occured when generating audio stream. Retrying...")
+                    continue
+
+            self.all_done = True
+
+            # wait for thread to finish
+            thread.join()
+
+            # wait for thread ended
+            #while not self.thread_ended:
+            #    time.sleep(0.01)
+            
+            #while True:
+            #    if self.thread_ended:
+            #        break
+            #    print("Waiting for thread to end...")
+            #    time.sleep(0.01)

        self.stream.close()
        p.terminate()
--- a/frontend/vad_recorder.py
+++ b/frontend/vad_recorder.py
@@ -0,0 +1,152 @@
+import torch
+import pyaudio
+import numpy as np
+import time
+import onnxruntime as ort
+import threading
+
+ort.set_default_logger_severity(3)
+
+
+SAMPLERATE = 16000
+
+class VADRecorder:
+    #def __init__(self, target_device_name, window_size_sec = 0.2, use_onnx = True):
+    def __init__(self, use_onnx = True):
+
+        print("Loading Silero VAD model... ", end="")
+
+        self.vad_model, utils = torch.hub.load(
+            repo_or_dir="snakers4/silero-vad",
+            model="silero_vad",
+            force_reload=False,
+            onnx=use_onnx
+        )
+
+        (
+            _, # get_speech_timestamps
+            _, # save_audio
+            _, # read_audio
+            self.VADIterator,
+            _ # collect_chunks
+        ) = utils
+
+        print("Done!")
+
+        self.vad_iterator = None
+
+
+
+    def _vad_recorder(self):
+        print("Listening...")
+
+        speech_win = 0
+        detected_audio = []
+
+        last_chunk = np.zeros(self.window_size, dtype=np.float32)
+
+        # Vad iterator needs to be reloaded because after running for a while, it freaks out and hallucinates speech.
+        vad_iter_reload_delay = 60 * 2
+        vad_iter_load_time = time.time()
+
+
+        self.vad_iterator = self.VADIterator(
+            self.vad_model,
+            threshold = self.vad_threshold,
+            sampling_rate = SAMPLERATE,
+            min_silence_duration_ms = self.min_silence_duration_ms,
+            speech_pad_ms = self.speech_pad_ms
+        )
+
+
+        while self.rec_flag:
+            chunk = np.frombuffer(self.stream_in.read(self.window_size), dtype=np.float32)
+
+            speech_dict = self.vad_iterator(chunk)
+
+            # check if speech_dict is {"start": x} or {"end": x}
+            if speech_dict is not None:
+                self.speech = "start" in speech_dict
+
+            if self.speech:
+                #print("Speech detected!")
+                if speech_win == 0:
+                    detected_audio = last_chunk.tolist()
+                speech_win += 1
+                detected_audio += chunk.tolist()
+
+            else:
+                if time.time() - vad_iter_load_time > vad_iter_reload_delay:
+                    self.vad_iterator.reset_states()
+
+                    vad_iter_load_time = time.time()
+
+                    self.vad_iterator = self.VADIterator(
+                        self.vad_model,
+                        threshold = self.vad_threshold,
+                        sampling_rate = SAMPLERATE,
+                        min_silence_duration_ms = self.min_silence_duration_ms,
+                        speech_pad_ms = self.speech_pad_ms
+                    )
+
+                    print("Reloaded VADIterator!")
+
+                if speech_win > 0:
+                    speech_win = 0
+
+                    self.audios_for_whisper.append(detected_audio)
+        
+            last_chunk = chunk.copy()
+
+
+
+
+    #def start_vad_recorder(self, target_device_name, window_size_sec = 0.1, vad_threshold = 0.6, min_silence_duration_ms = 150, speech_pad_ms = 0):
+    def start_vad_recorder(self, window_size_sec = 0.1, vad_threshold = 0.6, min_silence_duration_ms = 150, speech_pad_ms = 0):
+
+        self.window_size = int(window_size_sec * SAMPLERATE)
+
+        self.vad_threshold = vad_threshold
+        self.min_silence_duration_ms = min_silence_duration_ms
+        self.speech_pad_ms = speech_pad_ms
+
+
+        self.p = pyaudio.PyAudio()
+
+        #target_device_index = None
+        #for i in range(self.p.get_device_count()):
+        #    device_info = self.p.get_device_info_by_index(i)
+        #    if device_info['maxInputChannels'] > 0 and target_device_name in device_info['name']:
+        #        target_device_index = i
+        #        break
+        #
+        #if target_device_index is None:
+        #    print(f"No target device found with \"{target_device_name}\" in its name.")
+        #    exit()
+        #
+        #try:
+        #    self.stream_in = self.p.open(format=pyaudio.paFloat32, channels=1, rate=SAMPLERATE, input=True, frames_per_buffer=self.window_size, input_device_index=target_device_index)
+        #except OSError:
+        #    print(f"An unexpected error occured when trying to open device stream with \"{target_device_name}\" in its name. That could be caused by the device being disabled or unplugged.")
+        #    exit()
+
+        self.stream_in = self.p.open(format=pyaudio.paFloat32, channels=1, rate=SAMPLERATE, input=True, frames_per_buffer=self.window_size)
+
+        self.speech = False
+        self.audios_for_whisper = []
+
+
+        if self.vad_iterator is not None:
+            self.vad_iterator.reset_states()
+
+        self.rec_flag = True
+        self.vad_rec_thread = threading.Thread(target=self._vad_recorder, daemon=True)
+        self.vad_rec_thread.start()
+
+    def stop_vad_recorder(self):
+        self.rec_flag = False
+        self.vad_rec_thread.join()
+        
+        self.stream_in.stop_stream()
+        self.stream_in.close()
+        self.p.terminate()
--- a/frontend/vad_recorder_test.ipynb
+++ b/frontend/vad_recorder_test.ipynb
@@ -0,0 +1,69 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from vad_recorder import VADRecorder\n",
+    "import time\n",
+    "\n",
+    "# Instantiate VADRecorder; the target device name is passed to start_vad_recorder() below\n",
+    "vad_rec = VADRecorder()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"Starting VAD recording thread\")\n",
+    "\n",
+    "try:\n",
+    "    vad_rec.start_vad_recorder(target_device_name=\"Virtual\")\n",
+    "\n",
+    "\n",
+    "    recordings_count = 0\n",
+    "    now_speech = False\n",
+    "\n",
+    "    print(\"Done!\")\n",
+    "    while True:\n",
+    "        if vad_rec.speech != now_speech:\n",
+    "            now_speech = vad_rec.speech\n",
+    "            print(f\"Speech: {now_speech}\")\n",
+    "        if len(vad_rec.audios_for_whisper) != recordings_count:\n",
+    "            recordings_count = len(vad_rec.audios_for_whisper)\n",
+    "            print(f\"Recordings count: {recordings_count}\")\n",
+    "\n",
+    "        time.sleep(0.01)\n",
+    "\n",
+    "except (KeyboardInterrupt, SystemExit):\n",
+    "    print(\"Cleaning up...\")\n",
+    "    vad_rec.stop_vad_recorder()\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/frontend/whisper_tts_test.ipynb
+++ b/frontend/whisper_tts_test.ipynb
@@ -1,126 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import whisper\n",
-    "from tts_stream import TTSstream"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import sounddevice as sd\n",
-    "import numpy as np\n",
-    "import matplotlib.pyplot as plt\n",
-    "import torch\n",
-    "\n",
-    "# force matplotlib gui backend\n",
-    "import matplotlib\n",
-    "matplotlib.use('TkAgg')\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(\"Loading whisper model...\")\n",
-    "\n",
-    "if torch.cuda.is_available():\n",
-    "    print(\"using CUDA\")\n",
-    "    device = \"cuda\"\n",
-    "else:\n",
-    "    print(\"using CPU\")\n",
-    "    device = \"cpu\"\n",
-    "\n",
-    "model = whisper.load_model(\"medium\").to(device)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "tts = TTSstream(speaker_wav=\"voices/lector.wav\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# optional for changing speaker to some another one\n",
-    "tts.change_speaker(\"voices/speaker_name.wav\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# record 10 seconds of audio with sounddevice\n",
-    "print(\"Recording audio...\")\n",
-    "\n",
-    "fs = 16000\n",
-    "duration = 4\n",
-    "frames = sd.rec(int(duration * fs), samplerate=fs, channels=1)\n",
-    "sd.wait()\n",
-    "\n",
-    "\n",
-    "\n",
-    "frames = frames[:, 0]\n",
-    "#frames /= np.max(np.abs(frames))\n",
-    "\n",
-    "\n",
-    "## plot audio\n",
-    "#plt.plot(frames)\n",
-    "## set plot range to -1, 1\n",
-    "#plt.ylim(-1, 1)\n",
-    "#plt.show()\n",
-    "\n",
-    "# recognize text from audio\n",
-    "print(\"Recognizing text...\")\n",
-    "\n",
-    "result = model.transcribe(frames, language=\"pl\", fp16=False)\n",
-    "whisper_text = result[\"text\"]\n",
-    "print(whisper_text)\n",
-    "\n",
-    "# synthesize text to audio\n",
-    "print(\"Synthesizing audio...\")\n",
-    "tts.tts_speak(whisper_text)"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.6"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
--- a/player/udp
+++ b/player/udp
@@ -0,0 +1,68 @@
+import socket
+import psutil
+import pyaudio
+
+# This streamer can be used to preview the assistant's audio, for example during a presentation.
+# To use it, set OBS to the following ffmpeg streaming settings:
+# 48khz
+# stereo
+# PCM signed 16bit little endian
+
+
+
+# Get the local IPv4 address of the Wi-Fi network interface.
+# UDP_IP stays None when no interface with "Wi-Fi" in its name has an IPv4
+# address, so the fallback below can kick in instead of a NameError at bind time.
+UDP_IP = None
+interfaces = psutil.net_if_addrs()
+for interface, addrs in interfaces.items():
+    if "Wi-Fi" not in interface:
+        continue
+    print("name: ", interface)
+    UDP_IP = next((addr.address for addr in addrs if addr.family == socket.AF_INET), None)
+    if UDP_IP is not None:
+        print("ip: ", UDP_IP)
+        break
+
+if UDP_IP is None:
+    # Listen on every interface rather than crashing when Wi-Fi is unavailable.
+    UDP_IP = "0.0.0.0"
+    print("No Wi-Fi IPv4 address found, listening on all interfaces: ", UDP_IP)
+
+UDP_PORT = 8081
+
+print("port: ", UDP_PORT)
+
+# PyAudio parameters (must match the sender: 48 kHz, stereo, signed 16-bit PCM)
+CHANNELS = 2
+
+# 16-bit audio
+WIDTH = 2
+RATE = 48000
+
+# Maximum number of bytes read per UDP datagram
+BUFFER_SIZE = 2048
+
+
+# Create the UDP socket and PyAudio up front; everything that can fail afterwards
+# runs inside the try so the finally block always releases them.
+sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+p = pyaudio.PyAudio()
+stream = None
+
+try:
+    sock.bind((UDP_IP, UDP_PORT))
+
+    stream = p.open(format=p.get_format_from_width(WIDTH),
+                    channels=CHANNELS,
+                    rate=RATE,
+                    output=True)
+
+    print("Stream started. Press Ctrl+C to stop.")
+
+    while True:
+        # Receive audio data (the sender address is not needed)
+        data, _ = sock.recvfrom(BUFFER_SIZE)
+
+        # Play the audio data
+        stream.write(data)
+
+except KeyboardInterrupt:
+    print("\nStreaming stopped.")
+
+finally:
+    # Close the stream (if it was opened), PyAudio and the socket
+    if stream is not None:
+        stream.stop_stream()
+        stream.close()
+    p.terminate()
+    sock.close()
Author	SHA1	Message	Date
Looki2000	26118e4898	AAAAAA	2023-12-06 01:19:49 +01:00
Looki2000	8a7573f169	Added UDP raw audio stream player script	2023-12-05 01:40:59 +01:00
Looki2000	0a82dac47a	removed unused and unnecesary functionality	2023-12-04 23:36:07 +01:00
Looki2000	20238a31bf	small fixes	2023-12-04 23:07:43 +01:00
Looki2000	345de72aa1	followup to the last commit	2023-12-04 14:35:59 +01:00
Looki2000	610fcdf146	added chatgpt wrap tester	2023-12-04 14:35:35 +01:00
Looki2000	b631ce340d	fixed VERY CRITICAL system instructions formating error	2023-12-04 14:33:47 +01:00
lubek	fdfe324f05	GÓWNO	2023-12-04 14:12:07 +01:00
Looki2000	24f909abff	added chatgpt wrap, tts phone number splitting	2023-12-02 17:32:01 +01:00
Looki2000	f18e41476f	huge commit. assistant code is now semifunctional	2023-12-01 21:48:56 +01:00