Compare commits
10 Commits
af6f2a13b7
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
26118e4898 | ||
|
|
8a7573f169 | ||
|
|
0a82dac47a | ||
|
|
20238a31bf | ||
| 345de72aa1 | |||
| 610fcdf146 | |||
| b631ce340d | |||
|
|
fdfe324f05 | ||
|
|
24f909abff | ||
|
|
f18e41476f |
5
.gitignore
vendored
5
.gitignore
vendored
@@ -1,9 +1,10 @@
|
||||
finetune_dialogs_tool/temp/
|
||||
finetune_dialogs_tool/output_dialogs/
|
||||
finetune_dialogs_tool/__pycache__/
|
||||
|
||||
frontend/voices/*
|
||||
!frontend/voices/lector.wav
|
||||
!frontend/voices/lector source.txt
|
||||
|
||||
frontend/__pycache__/
|
||||
frontend/dupa.py
|
||||
|
||||
*/__pycache__/
|
||||
@@ -1,21 +1,80 @@
|
||||
import gradio as gr
|
||||
from core import Core
|
||||
import os
|
||||
|
||||
def start_assistant(phone_number, order_items, delivery_address, payment_method):
    """Hand the order entered in the UI to the assistant and start it.

    Args:
        phone_number: Contact phone number typed into the form.
        order_items: Free-text list of the items to order.
        delivery_address: Address the order should be delivered to.
        payment_method: Payment method selected in the dropdown.
    """
    # No ``global`` statement is needed: the module-level ``assistant`` is
    # only read here, never rebound.
    assistant.set_order_settings(
        phone_number,
        order_items,
        delivery_address,
        payment_method
    )
    assistant.assistant_start()
|
||||
|
||||
def proceed():
    """Button callback that only prints ``ok`` to the console."""
    print("ok")
|
||||
|
||||
def stop_assistant():
    """Ask the module-level assistant to stop."""
    assistant.assistant_stop()
|
||||
|
||||
|
||||
def set_advanced_settings(speech_recog_timeout, window_size_sec, vad_threshold, min_silence_duration_ms, speech_pad_ms):
    """Forward the speech-recognition settings from the UI to the assistant.

    Every argument is passed through unchanged to
    ``assistant.set_speech_recog_settings``.
    """
    assistant.set_speech_recog_settings(
        speech_recog_timeout=speech_recog_timeout,
        window_size_sec=window_size_sec,
        vad_threshold=vad_threshold,
        min_silence_duration_ms=min_silence_duration_ms,
        speech_pad_ms=speech_pad_ms,
    )
|
||||
|
||||
def set_voice_wav(speaker_wav):
    """Select the TTS voice from a file name inside the ``voices`` directory.

    Args:
        speaker_wav: File name of the voice sample; it is joined onto
            ``<cwd>/voices`` before being handed to the assistant.
    """
    voice_path = os.path.join(cwd, "voices", speaker_wav)
    assistant.set_tts_settings(voice_path)
|
||||
|
||||
|
||||
assistant = Core(
|
||||
use_chatgpt_placeholder = True
|
||||
)
|
||||
|
||||
cwd = os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
|
||||
with gr.Blocks() as demo:
|
||||
txt_2 = gr.Textbox(label="Podaj Twój nr telefonu", lines=1)
|
||||
txt_2 = gr.Textbox(label="Co chcesz zamówić?", lines=2)
|
||||
txt_3 = gr.Textbox(label="Na jaki adres?")
|
||||
txt_4 = gr.Textbox(label="Dodatkowe informacje", lines=2)
|
||||
gr.Dropdown(["Wejściowe", "Wyjściowe"], label="Urządzenie", info="Wybierz urządzenie audio!"),
|
||||
gr.Radio(["inteigentna osoba", "50/50", "głupek"], label="Jaki rodzaj osoby udawać"),
|
||||
gr.Radio(["hitler", "stuu", "lektor","belmondawg","sasza", "villager"], label="Głos", info="Jakiego głosu użyć?"),
|
||||
btn = gr.Button(value="Submit")
|
||||
btn.click(proceed)
|
||||
with gr.Tab("Basic Settings"):
|
||||
with gr.Row():
|
||||
phone_number = gr.Textbox(label="Twój Nr. Telefonu")
|
||||
order_items = gr.Textbox(label="Zamówienie", lines=5)
|
||||
delivery_address = gr.Textbox(label="Adres dostawy")
|
||||
payment_method = gr.Dropdown(label="Metoda płatności", choices=["Gotówka", "Karta"])
|
||||
with gr.Column():
|
||||
speaker_wav = gr.Textbox(label="Wav głosu", value="lector.wav")
|
||||
set_voice = gr.Button("Ustaw głos")
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo.launch()
|
||||
# init settings
|
||||
#assistant.set_tts_settings(speaker_wav.value)
|
||||
set_voice_wav(speaker_wav.value)
|
||||
|
||||
#set_voice.click(assistant.set_tts_settings, inputs=[speaker_wav], outputs=[])
|
||||
set_voice.click(set_voice_wav, inputs=[speaker_wav], outputs=[])
|
||||
with gr.Row():
|
||||
start_btn = gr.Button("Start Pizzobota")
|
||||
stop_btn = gr.Button("Stop Pizzobota")
|
||||
|
||||
|
||||
|
||||
start_btn.click(start_assistant, inputs=[phone_number, order_items, delivery_address, payment_method], outputs=[])
|
||||
stop_btn.click(stop_assistant, inputs=[])
|
||||
|
||||
with gr.Tab("Advanced Settings"):
|
||||
speech_recog_timeout = gr.Number(label="Speech Recog Timeout (sec)", value=1)
|
||||
window_size_sec = gr.Number(label="Window Size (sec)", value=0.1)
|
||||
vad_threshold = gr.Number(label="VAD Threshold", value=0.65)
|
||||
min_silence_duration_ms = gr.Number(label="Min Silence Duration (ms)", value=250)
|
||||
speech_pad_ms = gr.Number(label="Speech Pad (ms)", value=0)
|
||||
|
||||
# init settings
|
||||
set_advanced_settings(speech_recog_timeout.value, window_size_sec.value, vad_threshold.value, min_silence_duration_ms.value, speech_pad_ms.value)
|
||||
|
||||
set_adv_btn = gr.Button("Ustaw")
|
||||
set_adv_btn.click(set_advanced_settings, inputs=[speech_recog_timeout, window_size_sec, vad_threshold, min_silence_duration_ms, speech_pad_ms], outputs=[])
|
||||
|
||||
demo.launch()
|
||||
101
frontend/chatgpt_wrap.py
Normal file
101
frontend/chatgpt_wrap.py
Normal file
@@ -0,0 +1,101 @@
|
||||
import random
|
||||
from dupa import dupa
|
||||
import openai
|
||||
|
||||
|
||||
class ChatGPTWrap:
    """Chat backend of the ordering assistant.

    In normal mode the class keeps a chat history and asks the OpenAI chat
    completions API for each reply.  With ``use_chatgpt_placeholder`` set it
    never touches the OpenAI client and returns canned replies instead, which
    lets the rest of the pipeline be exercised on its own.
    """

    # In placeholder mode this reply of every session carries " CALLEND".
    _PLACEHOLDER_LAST_MESSAGE = 3

    def __init__(self, use_chatgpt_placeholder = False):
        """Prepare the backend.

        Args:
            use_chatgpt_placeholder: When true, skip loading the system
                instruction template and creating the OpenAI client.
        """
        self.use_chatgpt_placeholder = use_chatgpt_placeholder
        # Number of placeholder replies produced in the current session.
        self.message_idx = 0

        # placeholder chatgpt
        if use_chatgpt_placeholder:
            print("Using ChatGPT placeholder!")
            return

        # true chatgpt: everything that needs to be done only once
        print("Initializing ChatGPT... ", end="")
        with open("system_instructions.txt", "r", encoding="utf-8") as f:
            self.system_inst_template = f.read()

        self.client = openai.OpenAI(api_key=dupa)
        print("Done!")

    def init_order(self, phone_number, order_items, delivery_address, payment_method):
        """Start a new conversation for the given order.

        Stores the order data and resets the per-session state: the
        placeholder reply counter and, in normal mode, the chat history
        (re-seeded with freshly formatted system instructions).
        """
        self.phone_number = phone_number
        self.order_items = order_items
        self.delivery_address = delivery_address
        self.payment_method = payment_method

        # Reset the counter so that every session, not only the first one,
        # reaches the reply that ends the call.
        self.message_idx = 0

        if self.use_chatgpt_placeholder:
            return

        # generate system instructions from template
        self.system_inst = self.system_inst_template.format(
            phone_number = self.phone_number,
            order_items = self.order_items,
            delivery_location = self.delivery_address,
            payment_method = self.payment_method
        )

        print("System:")
        print(self.system_inst)

        # A new session starts from a history holding only the system message.
        self.chat_history = [{
            "role": "system",
            "content": self.system_inst
        }]

    def get_response(self, input_message):
        """Return the assistant's reply to ``input_message``.

        In normal mode the user message and the reply are both appended to
        the chat history.  In placeholder mode the reply is a fixed phrase
        followed by a randomly chosen order field, with " CALLEND" appended
        on the third reply of the session.

        Returns:
            The reply text (an empty string if the API returned no content).
        """
        # placeholder chatgpt
        if self.use_chatgpt_placeholder:
            choices = (
                self.phone_number,
                self.order_items,
                self.delivery_address,
                self.payment_method
            )

            self.message_idx += 1
            call_end = " CALLEND" if self.message_idx == self._PLACEHOLDER_LAST_MESSAGE else ""

            return f"czat dżi pi ti plejsholder {random.choice(choices)}{call_end}"

        # true chatgpt
        self.chat_history.append(
            {"role": "user", "content": input_message},
        )

        chat = self.client.chat.completions.create(
            model="ft:gpt-3.5-turbo-1106:personal::8QBgBttE",
            messages=self.chat_history,
            temperature=0.57,
            max_tokens=256,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )

        # ``content`` is optional in the API response; callers expect a str.
        reply = chat.choices[0].message.content or ""

        self.chat_history.append(
            {"role": "assistant", "content": reply}
        )

        return reply
|
||||
134
frontend/chatgpt_wrap_test.ipynb
Normal file
134
frontend/chatgpt_wrap_test.ipynb
Normal file
@@ -0,0 +1,134 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from chatgpt_wrap import ChatGPTWrap"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Initializing ChatGPT... Done!\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"chatgpt = ChatGPTWrap()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"System:\n",
|
||||
"You are a bot that will act as a guy that orders a pizza you will be connected on the phone with a pizza place. you will be asked questions about example: Where should be the pizza delivered etc. and you will respond with the data in the data section also you HAVE to respond in a full sentence because it will be transformed into audio using a tts software so you cant use a list just make a sentence like: i would like a margharitta and a cocacola please. basically just write it like you would say it dont put numbers but put words that are numbers dont add shortcuts add the full word\n",
|
||||
"\n",
|
||||
"DATA:\n",
|
||||
"phone: 123456789\n",
|
||||
"delivery location: ul. Amogusowa 1337, Suski Małe\n",
|
||||
"paymentMethod: karta\n",
|
||||
"OrderItems:\n",
|
||||
"1x margharitta\n",
|
||||
"2x sos majonezowy\n",
|
||||
"\n",
|
||||
"REMEMBER DONT USE NUMBERS, USE WORDS for example dont say 1x, say one time also REMEMBER to use replacement words to a word so it is appropriate to the whole sentence example:\n",
|
||||
"WRONG:\n",
|
||||
"Chciałbym zamówić jedną Margherittę, dwie Colę, pięć Fant i jedną Sprite.\n",
|
||||
"RIGHT:\n",
|
||||
"Chciałbym zamówić jedną Margarittę, dwie Kole, pięć Fant i jednego Sprajta.\n",
|
||||
"\n",
|
||||
"If the call ends, say at the end of your final response \"CALLEND\"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"chatgpt.init_order(\n",
|
||||
" phone_number = \"123456789\",\n",
|
||||
" order_items = \"1x margharitta\\n2x sos majonezowy\",\n",
|
||||
" delivery_address = \"ul. Amogusowa 1337, Suski Małe\",\n",
|
||||
" payment_method = \"karta\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"You >Witamy tu pizzer amogus\n",
|
||||
"ChatGPT >Dzień dobry, chciałem złożyć zamówienie z dostawą do domu\n",
|
||||
"You >dobrze. jakie to bedzie zamówienie?\n",
|
||||
"ChatGPT >poproszę jedną margaritte i dwa sosy majonezowe\n",
|
||||
"You >dobrze czy to wszystko?\n",
|
||||
"ChatGPT >tak\n",
|
||||
"You >jaki adres i płatność?\n",
|
||||
"ChatGPT >adres to ulica amogusowa 1337 w suskach małych, a płatność będzie kartą\n",
|
||||
"You >dobrze, pizza powinna byc gotowa za pół godziny\n",
|
||||
"ChatGPT >super, dziękuje i do widzenia CALLEND\n",
|
||||
"Conversation ended.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"try:\n",
|
||||
" while True:\n",
|
||||
" message = input(\"You >\")\n",
|
||||
" print(\"You >\" + message)\n",
|
||||
"\n",
|
||||
" if message == \"\":\n",
|
||||
" break\n",
|
||||
"\n",
|
||||
" response = chatgpt.get_response(message)\n",
|
||||
"\n",
|
||||
" print(\"ChatGPT >\" + response)\n",
|
||||
"\n",
|
||||
" if \"CALLEND\" in response:\n",
|
||||
" print(\"Conversation ended.\")\n",
|
||||
" break\n",
|
||||
"\n",
|
||||
"except (KeyboardInterrupt, SystemExit):\n",
|
||||
" exit()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
191
frontend/core.py
Normal file
191
frontend/core.py
Normal file
@@ -0,0 +1,191 @@
|
||||
from vad_recorder import VADRecorder
|
||||
from tts_stream import TTSStream
|
||||
from chatgpt_wrap import ChatGPTWrap
|
||||
from faster_whisper import WhisperModel
|
||||
import torch
|
||||
import time
|
||||
import numpy as np
|
||||
import re
|
||||
|
||||
|
||||
|
||||
class Core:
    """Ties together voice detection, transcription, the chat backend and TTS.

    Typical use: create the object, call ``set_order_settings``,
    ``set_speech_recog_settings`` and ``set_tts_settings``, then
    ``assistant_start`` (which loops until the assistant is stopped).
    """

    def __init__(self, whisper_model_name = "large-v3", use_chatgpt_placeholder = False):
        """Load all models.

        Args:
            whisper_model_name: Model name passed to ``WhisperModel``.
            use_chatgpt_placeholder: Passed on to ``ChatGPTWrap``.
        """
        self.use_chatgpt_placeholder = use_chatgpt_placeholder
        # Defined up front so assistant_stop() is safe before any start.
        self.assistant_running = False

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print("\n=======================================")
        print(f"Using {self.device.capitalize()} for:")
        print(" - Faster Whisper")
        print(" - TTS")
        print("=======================================\n")

        print("Loading Faster Whisper model... ", end="")
        # float16 is only usable on CUDA; on CPU fall back to int8.
        compute_type = "float16" if self.device == "cuda" else "int8"
        self.whisper_model = WhisperModel(whisper_model_name, device=self.device, compute_type=compute_type)
        print("Done!")

        # VADRecorder, TTSStream and ChatGPTWrap have their own console loading messages
        self.vad_rec = VADRecorder()
        self.tts = TTSStream(device=self.device)
        self.gpt_wrap = ChatGPTWrap(use_chatgpt_placeholder)

    def set_order_settings(self, phone_number, order_items, delivery_address, payment_method):
        """Remember the order data used for the next assistant session."""
        self.phone_number = phone_number
        self.order_items = order_items
        self.delivery_address = delivery_address
        self.payment_method = payment_method

    def set_speech_recog_settings(self, speech_recog_timeout, window_size_sec, vad_threshold, min_silence_duration_ms, speech_pad_ms):
        """Remember the speech recognition settings for the next session.

        ``speech_recog_timeout`` is the silence (in seconds) after which the
        collected text is sent to the chat backend; the remaining values are
        passed to ``VADRecorder.start_vad_recorder``.
        """
        self.speech_recog_timeout = speech_recog_timeout
        self.window_size_sec = window_size_sec
        self.vad_threshold = vad_threshold
        self.min_silence_duration_ms = min_silence_duration_ms
        self.speech_pad_ms = speech_pad_ms

    def set_tts_settings(self, speaker_wav):
        """Switch the TTS voice to the given speaker wav file."""
        print("Setting TTS speaker... ", end="")
        self.tts.change_speaker(speaker_wav)
        print("Done!")

    @staticmethod
    def _close_sentence(text):
        """Strip ``text`` and make sure non-empty text ends with punctuation."""
        text = text.strip()
        if text and text[-1] not in ".,!?":
            text += "."
        return text

    @staticmethod
    def _normalize_for_tts(text):
        """Rewrite digits and the "ul." abbreviation so TTS reads them well."""
        # separate long sequences of numbers (for example 123456789) into packets of 3 (123 456 789)
        text = re.sub(r"(\d{3})(?=\d)", r"\1 ", text)
        # Add space on the right side of numbers
        text = re.sub(r"(\d)([^\d\s])", r"\1 \2", text)
        # Add space on the left side of numbers
        text = re.sub(r"([^\d\s])(\d)", r"\1 \2", text)
        # replace "ul." with "ulica" (non case sensitive); the word boundary
        # keeps words that merely end in "ul." untouched
        return re.sub(r"\bul\.", "ulica", text, flags=re.IGNORECASE)

    @staticmethod
    def _strip_call_end(text):
        """Remove the CALLEND marker (and whitespace before it) from ``text``."""
        return re.sub(r"\s*CALLEND", "", text)

    def assistant_start(self):
        """Run the listen / transcribe / answer / speak loop.

        Requires the order and speech recognition settings to have been set.
        Returns once ``assistant_stop`` has been called, either from outside
        or by this loop itself after a reply containing "CALLEND".
        """
        print("Starting assistant...")

        print("Starting VAD recording thread... ", end="")
        self.vad_rec.start_vad_recorder(
            window_size_sec = self.window_size_sec,
            vad_threshold = self.vad_threshold,
            min_silence_duration_ms = self.min_silence_duration_ms,
            speech_pad_ms = self.speech_pad_ms
        )
        # From here on there is a recorder for assistant_stop() to shut down.
        self.assistant_running = True
        print("Done!")

        self.gpt_wrap.init_order(
            self.phone_number,
            self.order_items,
            self.delivery_address,
            self.payment_method
        )

        print("LISTENING!!!")

        last_recog_time = time.perf_counter()
        speech_recog_text = ""

        while True:
            # Ongoing speech keeps pushing the silence deadline back.
            if self.vad_rec.speech:
                last_recog_time = time.perf_counter()

            if len(self.vad_rec.audios_for_whisper) > 0:
                audio = np.array(self.vad_rec.audios_for_whisper.pop(0), dtype=np.float32)

                segments, _ = self.whisper_model.transcribe(audio, language="pl")
                if not self.assistant_running:
                    break

                text = self._close_sentence("".join(segment.text for segment in segments))
                if not text:
                    continue

                speech_recog_text += text + "\n"

                print("=========================================")
                print(text)

                last_recog_time = time.perf_counter()

            elif time.perf_counter() - last_recog_time > self.speech_recog_timeout and len(speech_recog_text) > 0:
                speech_recog_text = speech_recog_text.strip()

                print("=========================================\n\n")
                print("-----------------------------------------")
                print("!!!!!!!!!! SENDING TO CHATGPT !!!!!!!!!!!")
                print("-----------------------------------------")
                print(speech_recog_text)
                print("-----------------------------------------\n\n")

                gpt_response = self._normalize_for_tts(
                    self.gpt_wrap.get_response(speech_recog_text)
                )

                print("-----------------------------------------")
                if self.use_chatgpt_placeholder:
                    print("!!!!! CHATGPT PLACEHOLDER RESPONSE !!!!!!")
                else:
                    print("!!!!!!!!!!! CHATGPT RESPONSE !!!!!!!!!!!!")
                print("-----------------------------------------")
                print(gpt_response)
                print("-----------------------------------------\n\n")

                # The stop request may have arrived while waiting for the reply.
                if not self.assistant_running:
                    break

                speech_recog_text = ""

                # tts
                print("Speech synthesis stream started!")
                spoken_text = self._strip_call_end(gpt_response)
                if spoken_text:
                    self.tts.tts_speak(spoken_text)

                if "CALLEND" in gpt_response:
                    self.assistant_stop()

            time.sleep(0.01)

            if not self.assistant_running:
                break

    def assistant_stop(self):
        """Stop the assistant loop and the VAD recorder.

        Safe to call repeatedly or before any session was started; the
        recorder is only shut down if a session was actually running.
        """
        print("Stopping assistant... ", end="")

        was_running = getattr(self, "assistant_running", False)
        self.assistant_running = False

        if was_running:
            self.vad_rec.stop_vad_recorder()

        print("Done!")
|
||||
|
||||
|
||||
86
frontend/core_test.ipynb
Normal file
86
frontend/core_test.ipynb
Normal file
@@ -0,0 +1,86 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from core import Core\n",
|
||||
"\n",
|
||||
"core = Core(\n",
|
||||
" whisper_model_name = \"large-v3\",\n",
|
||||
" use_chatgpt_placeholder = False,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"core.set_order_settings(\n",
|
||||
" phone_number = \"123456789\",\n",
|
||||
" order_items = \"1x pizza margheritta 42 centrymetrów\\n1x sos pomidorowy\\n1x sos czosnkowy\",\n",
|
||||
" delivery_address = \"ul. Amogusowa 16A\",\n",
|
||||
" payment_method = \"gotówka\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"core.set_speech_recog_settings(\n",
|
||||
" speech_recog_timeout = 1.0,\n",
|
||||
" #audio_input_device_name = \"Virtual\",\n",
|
||||
" #audio_output_device_name = \"placeholder\",\n",
|
||||
" window_size_sec = 0.1,\n",
|
||||
" vad_threshold = 0.65,\n",
|
||||
" min_silence_duration_ms = 250,\n",
|
||||
" speech_pad_ms = 0\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"core.set_tts_settings(\n",
|
||||
" speaker_wav = \"voices/lector.wav\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"try:\n",
|
||||
" core.assistant_start()\n",
|
||||
"except (KeyboardInterrupt, SystemExit):\n",
|
||||
" core.assistant_stop()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
16
frontend/system_instructions.txt
Normal file
16
frontend/system_instructions.txt
Normal file
@@ -0,0 +1,16 @@
|
||||
You are a bot that will act as a guy that orders a pizza you will be connected on the phone with a pizza place. you will be asked questions about example: Where should be the pizza delivered etc. and you will respond with the data in the data section also you HAVE to respond in a full sentence because it will be transformed into audio using a tts software so you cant use a list just make a sentence like: i would like a margharitta and a cocacola please. basically just write it like you would say it dont put numbers but put words that are numbers dont add shortcuts add the full word
|
||||
|
||||
DATA:
|
||||
phone: {phone_number}
|
||||
delivery location: {delivery_location} (PIZZA IS FOR DELIVERY. NOT for dine-in.)
|
||||
paymentMethod: {payment_method}
|
||||
OrderItems:
|
||||
{order_items}
|
||||
|
||||
REMEMBER DONT USE NUMBERS, USE WORDS for example dont say 1x, say one time also REMEMBER to use replacement words to a word so it is appropriate to the whole sentence example:
|
||||
WRONG:
|
||||
Chciałbym zamówić jedną Margherittę, dwie Colę, pięć Fant i jedną Sprite.
|
||||
RIGHT:
|
||||
Chciałbym zamówić jedną Margarittę, dwie Kole, pięć Fant i jednego Sprajta.
|
||||
|
||||
If the call ends, say at the end of your final response "CALLEND"
|
||||
@@ -6,9 +6,9 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from tts_stream import TTSstream\n",
|
||||
"from tts_stream import TTSStream\n",
|
||||
"\n",
|
||||
"tts = TTSstream(speaker_wav=\"voices/lector.wav\")"
|
||||
"tts = TTSStream(speaker_wav=\"voices/lector.wav\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -27,7 +27,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tts.tts_speak(\"Testowanie syntezy naturalnego głosu za pomocą sztucznej sieci neuronowej.\")\n"
|
||||
"tts.tts_speak(\"Testowanie syntezy naturalnego głosu za pomocą sztucznych sieci neuronowych.\")\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import os
|
||||
import torch
|
||||
import pyaudio
|
||||
from TTS.api import TTS
|
||||
from TTS.tts.configs.xtts_config import XttsConfig
|
||||
@@ -7,27 +6,35 @@ from TTS.tts.models.xtts import Xtts
|
||||
from TTS.utils.generic_utils import get_user_data_dir
|
||||
import threading
|
||||
import time
|
||||
import re
|
||||
|
||||
|
||||
# Check if CUDA is available
|
||||
if torch.cuda.is_available():
|
||||
print("Using CUDA")
|
||||
device = "cuda"
|
||||
else:
|
||||
print("Using CPU")
|
||||
device = "cpu"
|
||||
|
||||
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
|
||||
|
||||
|
||||
class TTSstream:
|
||||
def __init__(self, speaker_wav):
|
||||
class TTSStream:
|
||||
def __init__(self, speaker_wav=None, device=None):
|
||||
model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
|
||||
|
||||
if device is None:
|
||||
import torch
|
||||
|
||||
# Check if CUDA is available
|
||||
if torch.cuda.is_available():
|
||||
print("Using CUDA")
|
||||
device = "cuda"
|
||||
else:
|
||||
print("Using CPU")
|
||||
device = "cpu"
|
||||
|
||||
#print(model_path)
|
||||
|
||||
print("Loading TTS model... ", end="")
|
||||
#
|
||||
# download model if it doesn't exist
|
||||
if not os.path.exists(os.path.join(model_path, "config.json")):
|
||||
print("Downloading model...")
|
||||
print("Downloading model... ", end="")
|
||||
tts = TTS()
|
||||
tts.download_model_by_name(model_name=model_name)
|
||||
|
||||
@@ -43,11 +50,16 @@ class TTSstream:
|
||||
)
|
||||
self.model.to(device)
|
||||
|
||||
print("Done!")
|
||||
|
||||
self.gpt_cond_latent, self.speaker_embedding = self.model.get_conditioning_latents(audio_path=speaker_wav)
|
||||
if speaker_wav is not None:
|
||||
#self.gpt_cond_latent, self.speaker_embedding = self.model.get_conditioning_latents(audio_path=speaker_wav)
|
||||
self.change_speaker(speaker_wav)
|
||||
|
||||
def change_speaker(self, speaker_wav):
|
||||
print("Loading speaker... ", end="")
|
||||
self.gpt_cond_latent, self.speaker_embedding = self.model.get_conditioning_latents(audio_path=speaker_wav)
|
||||
print("Done!")
|
||||
|
||||
def _write_stream(self):
|
||||
# play first play_buffer_size samples and remove them from the buffer
|
||||
@@ -58,6 +70,7 @@ class TTSstream:
|
||||
self.stream.write(self.chunk)
|
||||
else:
|
||||
if self.all_done:
|
||||
#self.thread_ended = True
|
||||
break
|
||||
time.sleep(0.01)
|
||||
|
||||
@@ -65,42 +78,60 @@ class TTSstream:
|
||||
def tts_speak(self, text):
|
||||
self.play_buffer_size = 512
|
||||
|
||||
chunks = self.model.inference_stream(
|
||||
text,
|
||||
"pl",
|
||||
self.gpt_cond_latent,
|
||||
self.speaker_embedding,
|
||||
stream_chunk_size=20,
|
||||
)
|
||||
|
||||
|
||||
# open pyaudio stream
|
||||
p = pyaudio.PyAudio()
|
||||
self.stream = p.open(format=pyaudio.paFloat32, channels=1, rate=24000, output=True)
|
||||
|
||||
# for each sentence ending with . or ! or ?
|
||||
for text in re.split(r"(?<=[.!?])", text):
|
||||
text = text.strip()
|
||||
|
||||
self.chunks_bin = b""
|
||||
self.all_done = False
|
||||
|
||||
# run write_stream as thread
|
||||
thread = threading.Thread(target=self._write_stream)
|
||||
thread.start()
|
||||
|
||||
while True:
|
||||
try:
|
||||
# read chunks from chunks generator as they are generated
|
||||
for self.chunk in chunks:
|
||||
self.chunks_bin += self.chunk.cpu().numpy().astype("float32").tobytes()
|
||||
break
|
||||
# some weird error caused by coqui-tts
|
||||
except:
|
||||
print("Error occured when generating audio stream. Retrying...")
|
||||
if len(text) == 0:
|
||||
continue
|
||||
|
||||
self.all_done = True
|
||||
chunks = self.model.inference_stream(
|
||||
text,
|
||||
"pl",
|
||||
self.gpt_cond_latent,
|
||||
self.speaker_embedding,
|
||||
stream_chunk_size=20,
|
||||
)
|
||||
|
||||
# wait for thread to finish
|
||||
thread.join()
|
||||
|
||||
self.chunks_bin = b""
|
||||
self.all_done = False
|
||||
|
||||
# run write_stream as thread
|
||||
#self.thread_ended = False
|
||||
thread = threading.Thread(target=self._write_stream)
|
||||
thread.start()
|
||||
|
||||
while True:
|
||||
try:
|
||||
# read chunks from chunks generator as they are generated
|
||||
for self.chunk in chunks:
|
||||
self.chunks_bin += self.chunk.cpu().numpy().astype("float32").tobytes()
|
||||
break
|
||||
# some weird error caused by coqui-tts
|
||||
except:
|
||||
print("Error occured when generating audio stream. Retrying...")
|
||||
continue
|
||||
|
||||
self.all_done = True
|
||||
|
||||
# wait for thread to finish
|
||||
thread.join()
|
||||
|
||||
# wait for thread ended
|
||||
#while not self.thread_ended:
|
||||
# time.sleep(0.01)
|
||||
|
||||
#while True:
|
||||
# if self.thread_ended:
|
||||
# break
|
||||
# print("Waiting for thread to end...")
|
||||
# time.sleep(0.01)
|
||||
|
||||
self.stream.close()
|
||||
p.terminate()
|
||||
|
||||
152
frontend/vad_recorder.py
Normal file
152
frontend/vad_recorder.py
Normal file
@@ -0,0 +1,152 @@
|
||||
import torch
|
||||
import pyaudio
|
||||
import numpy as np
|
||||
import time
|
||||
import onnxruntime as ort
|
||||
import threading
|
||||
|
||||
ort.set_default_logger_severity(3)
|
||||
|
||||
|
||||
SAMPLERATE = 16000
|
||||
|
||||
class VADRecorder:
|
||||
#def __init__(self, target_device_name, window_size_sec = 0.2, use_onnx = True):
|
||||
def __init__(self, use_onnx = True):
|
||||
|
||||
print("Loading Silero VAD model... ", end="")
|
||||
|
||||
self.vad_model, utils = torch.hub.load(
|
||||
repo_or_dir="snakers4/silero-vad",
|
||||
model="silero_vad",
|
||||
force_reload=False,
|
||||
onnx=use_onnx
|
||||
)
|
||||
|
||||
(
|
||||
_, # get_speech_timestamps
|
||||
_, # save_audio
|
||||
_, # read_audio
|
||||
self.VADIterator,
|
||||
_ # collect_chunks
|
||||
) = utils
|
||||
|
||||
print("Done!")
|
||||
|
||||
self.vad_iterator = None
|
||||
|
||||
|
||||
|
||||
def _vad_recorder(self):
|
||||
print("Listening...")
|
||||
|
||||
speech_win = 0
|
||||
detected_audio = []
|
||||
|
||||
last_chunk = np.zeros(self.window_size, dtype=np.float32)
|
||||
|
||||
# Vad iterator needs to be reloaded because after running for a while, it freaks out and hallucinates speech.
|
||||
vad_iter_reload_delay = 60 * 2
|
||||
vad_iter_load_time = time.time()
|
||||
|
||||
|
||||
self.vad_iterator = self.VADIterator(
|
||||
self.vad_model,
|
||||
threshold = self.vad_threshold,
|
||||
sampling_rate = SAMPLERATE,
|
||||
min_silence_duration_ms = self.min_silence_duration_ms,
|
||||
speech_pad_ms = self.speech_pad_ms
|
||||
)
|
||||
|
||||
|
||||
while self.rec_flag:
|
||||
chunk = np.frombuffer(self.stream_in.read(self.window_size), dtype=np.float32)
|
||||
|
||||
speech_dict = self.vad_iterator(chunk)
|
||||
|
||||
# check if speech_dict is {"start": x} ir {"end": x}
|
||||
if speech_dict is not None:
|
||||
self.speech = "start" in speech_dict
|
||||
|
||||
if self.speech:
|
||||
#print("Speech detected!")
|
||||
if speech_win == 0:
|
||||
detected_audio = last_chunk.tolist()
|
||||
speech_win += 1
|
||||
detected_audio += chunk.tolist()
|
||||
|
||||
else:
|
||||
if time.time() - vad_iter_load_time > vad_iter_reload_delay:
|
||||
self.vad_iterator.reset_states()
|
||||
|
||||
vad_iter_load_time = time.time()
|
||||
|
||||
self.vad_iterator = self.VADIterator(
|
||||
self.vad_model,
|
||||
threshold = self.vad_threshold,
|
||||
sampling_rate = SAMPLERATE,
|
||||
min_silence_duration_ms = self.min_silence_duration_ms,
|
||||
speech_pad_ms = self.speech_pad_ms
|
||||
)
|
||||
|
||||
print("Reloaded VADIterator!")
|
||||
|
||||
if speech_win > 0:
|
||||
speech_win = 0
|
||||
|
||||
self.audios_for_whisper.append(detected_audio)
|
||||
|
||||
last_chunk = chunk.copy()
|
||||
|
||||
|
||||
|
||||
|
||||
#def start_vad_recorder(self, target_device_name, window_size_sec = 0.1, vad_threshold = 0.6, min_silence_duration_ms = 150, speech_pad_ms = 0):
|
||||
def start_vad_recorder(self, window_size_sec = 0.1, vad_threshold = 0.6, min_silence_duration_ms = 150, speech_pad_ms = 0, target_device_name = None):
    """Open the microphone input stream and start the background VAD recorder thread.

    Args:
        window_size_sec: Length of one audio chunk in seconds; converted to a
            frame count with SAMPLERATE and used as ``frames_per_buffer``.
        vad_threshold: Stored on the instance and passed as ``threshold`` when
            the VAD iterator is (re)built by the recorder thread.
        min_silence_duration_ms: Stored on the instance and forwarded to the
            VAD iterator under the same name.
        speech_pad_ms: Stored on the instance and forwarded to the VAD
            iterator under the same name.
        target_device_name: Optional substring of the input device name to
            record from. ``None`` (the default) lets PyAudio pick the default
            input device.

    Raises:
        ValueError: If ``target_device_name`` is given but no input device
            contains it in its name.
        OSError: Propagated from PyAudio when the stream cannot be opened
            (for example when the device is disabled or unplugged).
    """
    self.window_size = int(window_size_sec * SAMPLERATE)

    self.vad_threshold = vad_threshold
    self.min_silence_duration_ms = min_silence_duration_ms
    self.speech_pad_ms = speech_pad_ms

    self.p = pyaudio.PyAudio()

    try:
        # None means "use the default input device" for PyAudio.open().
        target_device_index = None
        if target_device_name is not None:
            for i in range(self.p.get_device_count()):
                device_info = self.p.get_device_info_by_index(i)
                # Only consider devices that can actually capture audio.
                if device_info['maxInputChannels'] > 0 and target_device_name in device_info['name']:
                    target_device_index = i
                    break

            if target_device_index is None:
                raise ValueError(f"No target device found with \"{target_device_name}\" in its name.")

        self.stream_in = self.p.open(
            format=pyaudio.paFloat32,
            channels=1,
            rate=SAMPLERATE,
            input=True,
            frames_per_buffer=self.window_size,
            input_device_index=target_device_index,
        )
    except Exception:
        # Do not leak the PortAudio handle when device lookup or open fails.
        self.p.terminate()
        raise

    # Fresh recording state for this session.
    self.speech = False
    self.audios_for_whisper = []

    if self.vad_iterator is not None:
        self.vad_iterator.reset_states()

    # rec_flag is cleared by stop_vad_recorder() to stop the recording.
    self.rec_flag = True
    self.vad_rec_thread = threading.Thread(target=self._vad_recorder, daemon=True)
    self.vad_rec_thread.start()
|
||||
|
||||
def stop_vad_recorder(self):
    """Stop the recorder thread and release the audio stream and PyAudio handle.

    Clears ``rec_flag``, waits for the recorder thread to finish, then stops
    and closes the input stream and terminates PyAudio. Each teardown step
    runs even if an earlier one raises, so a failing ``stop_stream()`` or
    ``close()`` cannot leak the PyAudio instance; the exception still
    propagates to the caller.
    """
    self.rec_flag = False
    # Wait for the thread so nothing is still using the stream while it is closed.
    self.vad_rec_thread.join()

    try:
        self.stream_in.stop_stream()
    finally:
        try:
            self.stream_in.close()
        finally:
            self.p.terminate()
|
||||
69
frontend/vad_recorder_test.ipynb
Normal file
69
frontend/vad_recorder_test.ipynb
Normal file
@@ -0,0 +1,69 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from vad_recorder import VADRecorder\n",
|
||||
"import time\n",
|
||||
"\n",
|
||||
"# Instantiate VADRecorder with its default settings\n",
|
||||
"vad_rec = VADRecorder()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"Starting VAD recording thread\")\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" vad_rec.start_vad_recorder(target_device_name=\"Virtual\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" recordings_count = 0\n",
|
||||
" now_speech = False\n",
|
||||
"\n",
|
||||
" print(\"Done!\")\n",
|
||||
" while True:\n",
|
||||
" if vad_rec.speech != now_speech:\n",
|
||||
" now_speech = vad_rec.speech\n",
|
||||
" print(f\"Speech: {now_speech}\")\n",
|
||||
" if len(vad_rec.audios_for_whisper) != recordings_count:\n",
|
||||
" recordings_count = len(vad_rec.audios_for_whisper)\n",
|
||||
" print(f\"Recordings count: {recordings_count}\")\n",
|
||||
"\n",
|
||||
" time.sleep(0.01)\n",
|
||||
"\n",
|
||||
"except (KeyboardInterrupt, SystemExit):\n",
|
||||
" print(\"Cleaning up...\")\n",
|
||||
" vad_rec.stop_vad_recorder()\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -1,126 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import whisper\n",
|
||||
"from tts_stream import TTSstream"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import sounddevice as sd\n",
|
||||
"import numpy as np\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"import torch\n",
|
||||
"\n",
|
||||
"# force matplotlib gui backend\n",
|
||||
"import matplotlib\n",
|
||||
"matplotlib.use('TkAgg')\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"Loading whisper model...\")\n",
|
||||
"\n",
|
||||
"if torch.cuda.is_available():\n",
|
||||
" print(\"using CUDA\")\n",
|
||||
" device = \"cuda\"\n",
|
||||
"else:\n",
|
||||
" print(\"using CPU\")\n",
|
||||
" device = \"cpu\"\n",
|
||||
"\n",
|
||||
"model = whisper.load_model(\"medium\").to(device)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tts = TTSstream(speaker_wav=\"voices/lector.wav\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# optional for changing speaker to some another one\n",
|
||||
"tts.change_speaker(\"voices/speaker_name.wav\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# record `duration` seconds of audio with sounddevice\n",
|
||||
"print(\"Recording audio...\")\n",
|
||||
"\n",
|
||||
"fs = 16000\n",
|
||||
"duration = 4\n",
|
||||
"frames = sd.rec(int(duration * fs), samplerate=fs, channels=1)\n",
|
||||
"sd.wait()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"frames = frames[:, 0]\n",
|
||||
"#frames /= np.max(np.abs(frames))\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"## plot audio\n",
|
||||
"#plt.plot(frames)\n",
|
||||
"## set plot range to -1, 1\n",
|
||||
"#plt.ylim(-1, 1)\n",
|
||||
"#plt.show()\n",
|
||||
"\n",
|
||||
"# recognize text from audio\n",
|
||||
"print(\"Recognizing text...\")\n",
|
||||
"\n",
|
||||
"result = model.transcribe(frames, language=\"pl\", fp16=False)\n",
|
||||
"whisper_text = result[\"text\"]\n",
|
||||
"print(whisper_text)\n",
|
||||
"\n",
|
||||
"# synthesize text to audio\n",
|
||||
"print(\"Synthesizing audio...\")\n",
|
||||
"tts.tts_speak(whisper_text)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
68
udp raw audio player/udp raw audio stream player.py
Normal file
68
udp raw audio player/udp raw audio stream player.py
Normal file
@@ -0,0 +1,68 @@
|
||||
import socket
|
||||
import psutil
|
||||
import pyaudio
|
||||
|
||||
# This streamer can be used to preview the assistant audio, for example during a presentation.
# To use it, configure OBS with the following ffmpeg streaming settings:
#   48 kHz
#   stereo
#   PCM signed 16-bit little endian


# Get the local IPv4 address of the main network interface.
# Fall back to all interfaces ("0.0.0.0") when no "Wi-Fi" interface with an
# IPv4 address exists, so UDP_IP is always defined before bind() is called.
UDP_IP = "0.0.0.0"
interfaces = psutil.net_if_addrs()
for interface, addrs in interfaces.items():
    if "Wi-Fi" not in interface:
        continue
    print("name: ", interface)
    for addr in addrs:
        if addr.family == socket.AF_INET:
            UDP_IP = addr.address
            break
    # Only the first matching interface is considered.
    break
print("ip: ", UDP_IP)

UDP_PORT = 8081

print("port: ", UDP_PORT)

# PyAudio parameters
CHANNELS = 2

# 16-bit audio (sample width in bytes)
WIDTH = 2
RATE = 48000

# Maximum number of bytes read per datagram
BUFFER_SIZE = 2048


# Create a UDP socket
sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
p = None
stream = None

try:
    sock.bind((UDP_IP, UDP_PORT))

    # Initialize PyAudio
    p = pyaudio.PyAudio()
    stream = p.open(format=p.get_format_from_width(WIDTH),
                    channels=CHANNELS,
                    rate=RATE,
                    output=True)

    print("Stream started. Press Ctrl+C to stop.")

    while True:
        # Receive audio data
        data, addr = sock.recvfrom(BUFFER_SIZE)

        # Play the audio data
        stream.write(data)

except KeyboardInterrupt:
    print("\nStreaming stopped.")

finally:
    # Close whatever was successfully opened; bind() or open() may have failed
    # before the stream or PyAudio instance existed.
    if stream is not None:
        stream.stop_stream()
        stream.close()
    if p is not None:
        p.terminate()
    sock.close()
|
||||
Reference in New Issue
Block a user