big tts and speech recognition update

2023-11-15 19:57:17 +01:00
parent c87486728d
commit 6a24bee99b
6 changed files with 295 additions and 1 deletions
--- a/frontend/whisper_tts_test.ipynb
+++ b/frontend/whisper_tts_test.ipynb
@@ -0,0 +1,126 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import whisper\n",
+    "from tts_stream import TTSstream"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sounddevice as sd\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import torch\n",
+    "\n",
+    "# force matplotlib gui backend\n",
+    "import matplotlib\n",
+    "matplotlib.use('TkAgg')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"Loading whisper model...\")\n",
+    "\n",
+    "# Use the GPU when torch reports CUDA as available, otherwise the CPU.\n",
+    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
+    "print(\"using CUDA\" if device == \"cuda\" else \"using CPU\")\n",
+    "\n",
+    "# Load the \"medium\" Whisper checkpoint and move it to the selected device.\n",
+    "model = whisper.load_model(\"medium\").to(device)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tts = TTSstream(speaker_wav=\"voices/lector.wav\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# optional for changing speaker to some another one\n",
+    "tts.change_speaker(\"voices/speaker_name.wav\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Record `duration` seconds of mono audio from the default input device.\n",
+    "# 16 kHz matches the sample rate Whisper expects for raw audio arrays.\n",
+    "print(\"Recording audio...\")\n",
+    "\n",
+    "fs = 16000\n",
+    "duration = 4\n",
+    "frames = sd.rec(int(duration * fs), samplerate=fs, channels=1)\n",
+    "sd.wait()  # block until the recording has finished\n",
+    "\n",
+    "# sd.rec returns a (samples, channels) array; keep the single channel as 1-D.\n",
+    "frames = frames[:, 0]\n",
+    "\n",
+    "# recognize text from audio (Polish, decoding in float32)\n",
+    "print(\"Recognizing text...\")\n",
+    "\n",
+    "result = model.transcribe(frames, language=\"pl\", fp16=False)\n",
+    "whisper_text = result[\"text\"]\n",
+    "print(whisper_text)\n",
+    "\n",
+    "# synthesize text to audio; skip synthesis when nothing was recognized\n",
+    "if whisper_text.strip():\n",
+    "    print(\"Synthesizing audio...\")\n",
+    "    tts.tts_speak(whisper_text)\n",
+    "else:\n",
+    "    print(\"No speech recognized, skipping synthesis.\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}