support more graphemes

2024-12-13 17:33:32 +01:00 · 2024-12-13 17:33:32 +01:00 · b38d4f2430
commit b38d4f2430
parent 2e28ce053f
1 changed files with 25 additions and 9 deletions
--- a/src/main.rs
+++ b/src/main.rs
@ -1,6 +1,18 @@
 use std::env::args;

-const KNOWN_GRAPHEMES: [&str; 10] = ["a", "i", "u", "e", "o", "ka", "ki", "ku", "ke", "ko"];
+// TODO: add the standalone "n" and digraphs
+const KNOWN_GRAPHEMES: [&str; 73] = [
+    "a", "i", "u", "e", "o", "ka", "ki", "ku", "ke", "ko", "ga", "gi", "gu", "ge", "go", "sa",
+    "shi", "su", "se", "so", "za", "ji", "zu", "ze", "zo", "ta", "chi", "tsu", "te", "to", "da",
+    "ji", "zu", "de", "do", "na", "ni", "nu", "ne", "no", "ha", "hi", "fu", "he", "ho", "ba", "bi",
+    "bu", "be", "bo", "pa", "pi", "pu", "pe", "po", "ma", "mi", "mu", "me", "mo", "ya", "yu", "ye",
+    "yo", "ra", "ri", "ru", "re", "ro", "wa", "wi", "we", "wo",
+];
+
+// Convert Unicode Japanese text into the Latin alphabet; required before calling split_word().
+fn romanize(word: String) -> String {
+    todo!();
+}

 // look at the first letter, if it exists in KNOWN_GRAPHEMES, then add to graphemes
 // else look at the first and second letter, if that exists in KNOWN_GRAPHEMES, then add that to grapehemes
@ -14,19 +26,20 @@ fn split_word(word: String) -> Vec<String> {
            Some(val) => String::from(val),
            None => break,
        };
-        dbg!(&grapheme);

-        let temp = grapheme.as_str();
-
-        if KNOWN_GRAPHEMES.contains(&temp) {
+        // TODO: figure out how to deal with a standalone 'n' (it is also the first letter of "na", "ni", ...)
+        if KNOWN_GRAPHEMES.contains(&grapheme.as_str()) {
            graphemes.push(grapheme);
        } else {
-            grapheme.push(word_chars.next().unwrap());
+            match word_chars.next() {
+                Some(val) => grapheme.push(val),
+                None => panic!("word contains non japanese grapheme: {grapheme}"),
+            };

-            let temp = grapheme.as_str();
-
-            if KNOWN_GRAPHEMES.contains(&temp) {
+            if KNOWN_GRAPHEMES.contains(&grapheme.as_str()) {
                graphemes.push(grapheme);
+            } else {
+                panic!("word contains non japanese grapheme: {grapheme}");
            }
        }
    }
@ -48,6 +61,9 @@ fn main() {
        None => panic!("how?!"),
    };

+    // TODO: sanitise even further: check whether the input contains any illegal characters.
+    // Input should only contain a-z, A-Z or, in the future, Unicode Japanese characters.
+
    println!("Sanitised input: {}", input);

    let graphemes = split_word(input);