From 89dc0009278b4fdfb115742094e469057b78154e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Mangano?= <fmang@mg0.fr>
Date: Fri, 3 Mar 2023 11:22:18 +0900
Subject: [PATCH] Rework the encoding converter to support std::u8string

---
 src/cli.cc     |  3 ---
 src/opustags.h | 26 ++++++--------------
 src/system.cc  | 64 +++++++++++++++++++++++++++++++++++++++++++-------
 t/system.cc    | 12 +++++-----
 4 files changed, 68 insertions(+), 37 deletions(-)
diff --git a/src/cli.cc b/src/cli.cc
index bc9a3d5..1f3bb4a 100644
--- a/src/cli.cc
+++ b/src/cli.cc
@@ -63,7 +63,6 @@ static struct option getopt_options[] = {
 ot::options ot::parse_options(int argc, char** argv, FILE* comments_input)
 {
 	options opt;
-	static ot::encoding_converter to_utf8("", "UTF-8");
 	const char* equal;
 	ot::status rc;
 	bool set_all = false;
@@ -230,7 +229,6 @@ static std::string format_value(const std::string& source)
  */
 void ot::print_comments(const std::list<std::string>& comments, FILE* output, bool raw)
 {
-	static ot::encoding_converter from_utf8("UTF-8", "");
 	std::string local;
 	bool has_control = false;
 	for (const std::string& source_comment : comments) {
@@ -268,7 +266,6 @@ void ot::print_comments(const std::list<std::string>& comments, FILE* output, bo
 std::list<std::string> ot::read_comments(FILE* input, bool raw)
 {
 	std::list<std::string> comments;
-	static ot::encoding_converter to_utf8("", "UTF-8");
 	comments.clear();
 	char* source_line = nullptr;
 	size_t buflen = 0;
diff --git a/src/opustags.h b/src/opustags.h
index 36614b0..cfb80d8 100644
--- a/src/opustags.h
+++ b/src/opustags.h
@@ -160,25 +160,13 @@ private:
 /** Read a whole file into memory and return the read content. */
 byte_string slurp_binary_file(const char* filename);
 
-/** C++ wrapper for iconv. */
-class encoding_converter {
-public:
-	/**
-	 * Allocate the iconv conversion state, initializing the given source and destination
-	 * character encodings. If it's okay to have some information lost, make sure `to` ends with
-	 * "//TRANSLIT", otherwise the conversion will fail when a character cannot be represented
-	 * in the target encoding. See the documentation of iconv_open for details.
-	 */
-	encoding_converter(const char* from, const char* to);
-	~encoding_converter();
-	/**
-	 * Convert text using iconv. If the input sequence is invalid, return #st::badly_encoded and
-	 * abort the processing, leaving out in an undefined state.
-	 */
-	std::string operator()(std::string_view in);
-private:
-	iconv_t cd; /**< conversion descriptor */
-};
+/** Convert a string from the system locale’s encoding to UTF-8. */
+std::u8string encode_utf8(std::string_view);
+std::string to_utf8(std::string_view); ///< \deprecated
+
+/** Convert a string from UTF-8 to the system locale’s encoding. */
+std::string decode_utf8(std::u8string_view);
+std::string from_utf8(std::string_view); ///< \deprecated
 
 /** Escape a string so that a POSIX shell interprets it as a single argument. */
 std::string shell_escape(std::string_view word);
diff --git a/src/system.cc b/src/system.cc
index 6d26703..431e040 100644
--- a/src/system.cc
+++ b/src/system.cc
@@ -143,24 +143,47 @@ ot::byte_string ot::slurp_binary_file(const char* filename)
 	return content;
 }
 
-ot::encoding_converter::encoding_converter(const char* from, const char* to)
+/** C++ wrapper for iconv. */
+class encoding_converter {
+public:
+	/**
+	 * Allocate the iconv conversion state, initializing the given source and destination
+	 * character encodings. If it's okay to have some information lost, make sure `to` ends with
+	 * "//TRANSLIT", otherwise the conversion will fail when a character cannot be represented
+	 * in the target encoding. See the documentation of iconv_open for details.
+	 */
+	encoding_converter(const char* from, const char* to);
+	~encoding_converter();
+	/**
+	 * Convert text using iconv. If the input is invalid or cannot be represented in the target
+	 * encoding, throw an ot::status with the code #st::badly_encoded.
+	 */
+	template<class InChar, class OutChar>
+	std::basic_string<OutChar> convert(std::basic_string_view<InChar>);
+private:
+	iconv_t cd; /**< conversion descriptor */
+};
+
+encoding_converter::encoding_converter(const char* from, const char* to)
 {
 	cd = iconv_open(to, from);
 	if (cd == (iconv_t) -1)
 		throw std::bad_alloc();
 }
 
-ot::encoding_converter::~encoding_converter()
+encoding_converter::~encoding_converter()
 {
 	iconv_close(cd);
 }
 
-std::string ot::encoding_converter::operator()(std::string_view in)
+template<class InChar, class OutChar>
+std::basic_string<OutChar> encoding_converter::convert(std::basic_string_view<InChar> in)
 {
 	iconv(cd, nullptr, nullptr, nullptr, nullptr);
-	std::string out;
+	std::basic_string<OutChar> out;
 	out.reserve(in.size());
-	char* in_cursor = const_cast<char*>(in.data());
+	const char* in_data = reinterpret_cast<const char*>(in.data());
+	char* in_cursor = const_cast<char*>(in_data);
 	size_t in_left = in.size();
 	constexpr size_t chunk_size = 1024;
 	char chunk[chunk_size];
@@ -172,13 +195,13 @@ std::string ot::encoding_converter::operator()(std::string_view in)
 		if (rc == (size_t) -1 && errno == E2BIG) {
 			// Loop normally.
 		} else if (rc == (size_t) -1) {
-			throw status {ot::st::badly_encoded, strerror(errno) + "."s};
+			throw ot::status {ot::st::badly_encoded, strerror(errno) + "."s};
 		} else if (rc != 0) {
-			throw status {ot::st::badly_encoded,
-			              "Some characters could not be converted into the target encoding."};
+			throw ot::status {ot::st::badly_encoded,
+			                 "Some characters could not be converted into the target encoding."};
 		}
 
-		out.append(chunk, out_cursor - chunk);
+		out.append(reinterpret_cast<OutChar*>(chunk), out_cursor - chunk);
 		if (in_cursor == nullptr)
 			break;
 		else if (in_left == 0)
@@ -187,6 +210,43 @@ std::string ot::encoding_converter::operator()(std::string_view in)
 	return out;
 }
 
+/**
+ * Process-wide converters between the locale encoding ("") and UTF-8. They are opened on first
+ * use rather than during static initialization, because iconv_open resolves "" against the locale
+ * in effect when it is called, and a constructor failure before main could not be caught.
+ */
+static encoding_converter& to_utf8_cvt()
+{
+	static encoding_converter cvt("", "UTF-8");
+	return cvt;
+}
+
+static encoding_converter& from_utf8_cvt()
+{
+	static encoding_converter cvt("UTF-8", "");
+	return cvt;
+}
+
+std::u8string ot::encode_utf8(std::string_view in)
+{
+	return to_utf8_cvt().convert<char, char8_t>(in);
+}
+
+std::string ot::to_utf8(std::string_view in)
+{
+	return to_utf8_cvt().convert<char, char>(in);
+}
+
+std::string ot::decode_utf8(std::u8string_view in)
+{
+	return from_utf8_cvt().convert<char8_t, char>(in);
+}
+
+std::string ot::from_utf8(std::string_view in)
+{
+	return from_utf8_cvt().convert<char, char>(in);
+}
+
 std::string ot::shell_escape(std::string_view word)
 {
 	std::string escaped_word;
diff --git a/t/system.cc b/t/system.cc
index f0b0e0a..c8deae7 100644
--- a/t/system.cc
+++ b/t/system.cc
@@ -48,15 +48,15 @@ void check_slurp()
 
 void check_converter()
 {
-	const char* ephemere_iso = "\xc9\x70\x68\xe9\x6d\xe8\x72\x65";
-	ot::encoding_converter to_utf8("ISO_8859-1", "UTF-8");
-	ot::encoding_converter from_utf8("UTF-8", "ISO_8859-1");
+	is(ot::from_utf8(ot::to_utf8("Éphémère")), "Éphémère", "from_utf8 reverts to_utf8");
+	is(ot::to_utf8(ot::from_utf8("Éphémère")), "Éphémère", "to_utf8 reverts from_utf8");
 
-	is(to_utf8(ephemere_iso), "Éphémère", "conversion to UTF-8 is correct");
-	is(from_utf8("Éphémère"), ephemere_iso, "conversion from UTF-8 is correct");
+	is(ot::decode_utf8(ot::encode_utf8("Éphémère")), "Éphémère", "decode_utf8 reverts encode_utf8");
+	opaque_is(ot::encode_utf8(ot::decode_utf8(u8"Éphémère")), u8"Éphémère",
+	          "encode_utf8 reverts decode_utf8");
 
 	try {
-		from_utf8("\xFF\xFF");
+		ot::from_utf8("\xFF\xFF");
 		throw failure("conversion from bad UTF-8 did not fail");
 	} catch (const ot::status&) {}
 }