// Splits a romanized Japanese word, given as the single command-line argument,
// into its individual graphemes and prints them.

/// Every romanized grapheme that `split_word` recognises.
///
/// The table holds the lone `"n"`, the five vowels and the consonant + vowel
/// syllables, each one to three ASCII letters long. `"ji"` and `"zu"` appear
/// twice because two different rows romanize to the same spelling; the
/// duplicates are harmless for prefix matching.
// TODO: add Digraphs
const KNOWN_GRAPHEMES: [&str; 74] = [
    "n", "a", "i", "u", "e", "o", "ka", "ki", "ku", "ke", "ko", "ga", "gi", "gu", "ge", "go", "sa",
    "shi", "su", "se", "so", "za", "ji", "zu", "ze", "zo", "ta", "chi", "tsu", "te", "to", "da",
    "ji", "zu", "de", "do", "na", "ni", "nu", "ne", "no", "ha", "hi", "fu", "he", "ho", "ba", "bi",
    "bu", "be", "bo", "pa", "pi", "pu", "pe", "po", "ma", "mi", "mu", "me", "mo", "ya", "yu", "ye",
    "yo", "ra", "ri", "ru", "re", "ro", "wa", "wi", "we", "wo",
];

/// Entry point: validates the single word argument, romanizes it when it is
/// not pure ASCII, splits it into graphemes and prints the result.
///
/// Exits with status 2 when the argument count is wrong and with status 1 when
/// the word fails validation.
fn main() {
    // TODO: loop over args in order to work with sentences

    // Read the argument list once instead of re-creating the iterator for
    // every query.
    let mut args = std::env::args();
    let program = args.next().unwrap_or_else(|| String::from("program"));
    let raw = match (args.next(), args.next()) {
        (Some(word), None) => word,
        _ => {
            eprintln!("usage: {program} <word>");
            std::process::exit(2);
        }
    };

    let mut input = raw.trim().to_lowercase();

    if is_invalid(&input) {
        eprintln!("input {input} is invalid");
        std::process::exit(1);
    }

    if !input.is_ascii() {
        input = romanize(&input);
    }

    println!("Sanitised input: {}", input);

    let graphemes = split_word(&input);

    dbg!(graphemes);
}

/// Returns `true` when `word` contains a character that can never be part of
/// a valid input.
///
/// ASCII characters must be letters. Non-ASCII characters are not judged here;
/// they are left for `romanize` to handle.
// TODO: reject non-ASCII characters that are not Japanese once romanize exists
fn is_invalid(word: &str) -> bool {
    word.chars()
        .any(|c| c.is_ascii() && !c.is_ascii_alphabetic())
}

/// Converts Unicode Japanese text into the Latin alphabet and returns the
/// romanized version of `word`.
///
/// # Panics
///
/// Always panics for now: romanization is not implemented yet.
fn romanize(word: &str) -> String {
    todo!("romanizing {word:?} is not supported yet");
}

/// Splits a romanized word into the graphemes listed in `KNOWN_GRAPHEMES`.
///
/// At every position the longest known grapheme that prefixes the remaining
/// text is taken. This keeps `"na"` together while still emitting a lone
/// `"n"` when no syllable starting with `n` fits, including when the next
/// grapheme is three letters long (`"senshi"` -> `se`, `n`, `shi`).
///
/// An empty `word` yields an empty vector.
///
/// # Panics
///
/// Panics when the remaining text does not start with any known grapheme.
fn split_word(word: &str) -> Vec<String> {
    let mut graphemes: Vec<String> = Vec::new();
    let mut rest = word;

    while !rest.is_empty() {
        // strip_prefix hands back the tail only for graphemes that actually
        // match, so no manual byte slicing is needed.
        let longest = KNOWN_GRAPHEMES
            .iter()
            .filter_map(|known| rest.strip_prefix(*known).map(|tail| (*known, tail)))
            .max_by_key(|(known, _)| known.len());

        match longest {
            Some((known, tail)) => {
                graphemes.push(known.to_string());
                rest = tail;
            }
            None => {
                // Report at most three characters: the longest known grapheme
                // is three letters, so that is the window that failed to match.
                let grapheme: String = rest.chars().take(3).collect();
                panic!("word contains non japanese grapheme: {grapheme}");
            }
        }
    }

    graphemes
}