From 2b20e5ece972e3e994f907203590e64a8d7159d3 Mon Sep 17 00:00:00 2001 From: AustrianToast Date: Fri, 25 Oct 2024 00:58:30 +0200 Subject: [PATCH] small progress on speech synthesis research --- .obsidian/workspace.json | 8 ++++---- Speech synthesis.md | 35 ++++++++++++++++++++++++++++------- 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/.obsidian/workspace.json b/.obsidian/workspace.json index 6083674..a5c502d 100644 --- a/.obsidian/workspace.json +++ b/.obsidian/workspace.json @@ -13,12 +13,12 @@ "state": { "type": "markdown", "state": { - "file": "Reasons why I use Obsidian.md", + "file": "Speech synthesis.md", "mode": "source", "source": false }, "icon": "lucide-file", - "title": "Reasons why I use Obsidian" + "title": "Speech synthesis" } } ] @@ -163,12 +163,12 @@ }, "active": "8b669fe838f51401", "lastOpenFiles": [ + "Satan.md", "Framework.md", + "Reasons why I use Obsidian.md", "Beehive.md", "Speech synthesis.md", "Updating.md", - "Satan.md", - "Reasons why I use Obsidian.md", "Knowledge Base/labwc.md", "Knowledge Base", "Hentai.md", diff --git a/Speech synthesis.md b/Speech synthesis.md index 16f2011..781b268 100644 --- a/Speech synthesis.md +++ b/Speech synthesis.md @@ -1,16 +1,37 @@ ## Goal The goal is to write a text to speech program, where all sounds are purely created through mathematical equations, in order to create my own true 100% machine speech synthesis. +## Inspirations +These inspirations include some newer products and some very old software solutions which are practically impossible to obtain and as such have been almost completely forgotten. +### [MUSIC-N](https://en.wikipedia.org/wiki/MUSIC-N) +It was used to create [this famous piece of music](https://www.youtube.com/watch?v=41U78QP8nBk). If my guess is correct, this should be made using Formant synthesis. Since this is a product of Bell Labs and because of its age, it probably lives in an archive somewhere or maybe has even been lost. 
+### [gnuspeech](https://savannah.gnu.org/projects/gnuspeech) +It makes use of Articulatory synthesis and is an open source project. Sadly it only runs on extremely old versions of Mac OS X and on the BSD-based NeXTSTEP, which is closed source and as such practically impossible to get. These factors have led to gnuspeech being unmaintained and mostly forgotten. +### [UTAU](http://utau2008.web.fc2.com/) +I really like Utane Uta (Defoko) and Adachi Rei. Both of them are completely synthesised and make use of no Voicebanks. +### [Vocaloid](https://www.vocaloid.com/en/) +Not the kind of speech synthesis I am going for, but I still really like them. + +### SAM +I currently cannot easily figure out which type of synthesis it is even after taking a short look at the source code of [this newer implementation](https://github.com/s-macke/SAM). My current guess is that it is using some form of Concatenation synthesis, since it is combining phonemes to create words. + ## Planning -### Language +### Spoken Language The language is one of the most important aspects of this program and as such needs to be carefully chosen. The possible languages from which I wanted to choose from where German, English or Japanese. In order to help me which of those three languages I should choose, I came up with these requirements. -- Small alphabet in order to minimise the amount of code required -- Individual letters stitched into words together require no further processing in order to be understandable +- Small alphabet in order to minimise the amount of work required +- Individual letters stitched into words together should require no further processing in order to be understandable - The words need to be understandable even without proper or any emphasising at all -I speak German and English fluently and they have a relatively small alphabet. 
In the end, I chose Japanese, because it may have a way bigger alphabet, but at least it is consistent in the way different combinations of letters create words. +Since I speak German and English fluently and they have a relatively small alphabet, I chose to do both, but I am going to start with English first. ### Programming Language -There are a many options, but to help narrow it down, I came up with these requirements. -- functional programming (pure functions) -- compiles to ELF \ No newline at end of file +There are many options, but to help narrow it down, I came up with a few requirements. +- should be functional (Haskell for example) or procedural (C for example) +- compiles to binary (should require no interpreter, so no python or java for example) + +### What type of speech synthesis +When I looked on [Wikipedia](https://en.wikipedia.org/wiki/Speech_synthesis#Synthesizer_technologies) I saw that there are multiple types of speech synthesis. +Since I am restricting myself to pure machine synthesis, I can only choose between two types. +- Formant synthesis +- Articulatory synthesis +I really wanna do both, but I am going to start with Articulatory synthesis. I am only doing it first, because it seems way cooler to me. \ No newline at end of file