From ebc8347c9e18c7f48c1a736b5a8b0f5b0d0b34af Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Mangano-Tarumi?= <fmang@mg0.fr>
Date: Sun, 9 Dec 2018 11:45:00 -0500
Subject: [PATCH] character encoding converter

---
 src/opustags.h | 25 +++++++++++++++++++++++++
 src/system.cc  | 44 ++++++++++++++++++++++++++++++++++++++++++++
 t/system.cc    | 24 +++++++++++++++++++++++-
 3 files changed, 92 insertions(+), 1 deletion(-)

diff --git a/src/opustags.h b/src/opustags.h
index 37b82c7..2ae5ce0 100644
--- a/src/opustags.h
+++ b/src/opustags.h
@@ -23,6 +23,7 @@
 
 #pragma once
 
+#include <iconv.h>
 #include <ogg/ogg.h>
 #include <stdio.h>
 
@@ -55,6 +56,9 @@ enum class st {
 	error,
 	standard_error, /**< Error raised by the C standard library. */
 	int_overflow,
+	/* System */
+	badly_encoded,
+	information_lost,
 	/* Ogg */
 	bad_stream,
 	end_of_stream,
@@ -133,6 +137,39 @@ private:
 	ot::file file;
 };
 
+/**
+ * C++ wrapper for iconv.
+ *
+ * An instance owns an iconv conversion descriptor, opened by the constructor and closed by the
+ * destructor. Copying is disabled because two copies would both close the same descriptor.
+ */
+class encoding_converter {
+public:
+	/**
+	 * Allocate the iconv conversion state, initializing the given source and destination
+	 * character encodings. If it's okay to have some information lost, make sure `to` ends with
+	 * "//TRANSLIT", otherwise the conversion will fail when a character cannot be represented
+	 * in the target encoding. See the documentation of iconv_open for details.
+	 *
+	 * \throws std::bad_alloc if iconv_open fails.
+	 */
+	encoding_converter(const char* from, const char* to);
+	encoding_converter(const encoding_converter&) = delete;
+	encoding_converter& operator=(const encoding_converter&) = delete;
+	~encoding_converter();
+	/**
+	 * Convert text using iconv. If the input sequence is invalid, return #st::badly_encoded and
+	 * abort the processing. If some character could not be converted perfectly, keep converting
+	 * the string and finally return #st::information_lost.
+	 *
+	 * \param in  Text in the source encoding.
+	 * \param out Receives the converted text; any previous content is discarded.
+	 */
+	status operator()(const std::string& in, std::string& out);
+private:
+	iconv_t cd; /**< conversion descriptor */
+};
+
 /** \} */
 
 /***********************************************************************************************//**
diff --git a/src/system.cc b/src/system.cc
index fb5396a..e62c25b 100644
--- a/src/system.cc
+++ b/src/system.cc
@@ -50,3 +50,75 @@ void ot::partial_file::abort()
 	file.reset();
 	remove(temporary_name.c_str());
 }
+
+/**
+ * Open an iconv conversion descriptor that converts text from encoding `from` to encoding
+ * `to`. Any failure of iconv_open is reported by throwing std::bad_alloc.
+ */
+ot::encoding_converter::encoding_converter(const char* from, const char* to)
+{
+	// Mind the argument order: iconv_open takes the destination encoding first.
+	cd = iconv_open(to, from);
+	if (cd == (iconv_t) -1)
+		throw std::bad_alloc();
+}
+
+/** Close the conversion descriptor opened by the constructor. */
+ot::encoding_converter::~encoding_converter()
+{
+	iconv_close(cd);
+}
+
+/**
+ * Convert `in` from the source encoding to the destination encoding and store the result in
+ * `out`, replacing its previous content.
+ *
+ * The output is produced through a fixed-size stack buffer that is appended to `out` each
+ * time iconv returns, so the converted text may be longer than the buffer.
+ *
+ * \return #st::ok on success, #st::badly_encoded as soon as iconv reports an error other than
+ *         a full output buffer (the content of `out` is then partial), or
+ *         #st::information_lost if the conversion completed but iconv reported non-reversible
+ *         conversions.
+ */
+ot::status ot::encoding_converter::operator()(const std::string& in, std::string& out)
+{
+	// Bring the descriptor back to its initial state in case a previous call left it dirty.
+	iconv(cd, nullptr, nullptr, nullptr, nullptr);
+	out.clear();
+	out.reserve(in.size());
+	// iconv wants a pointer to non-const char; the cast is only there to satisfy its prototype.
+	char* in_cursor = const_cast<char*>(in.data());
+	size_t in_left = in.size();
+	constexpr size_t chunk_size = 1024;
+	char chunk[chunk_size];
+	bool lost_information = false;
+	for (;;) {
+		char *out_cursor = chunk;
+		size_t out_left = chunk_size;
+		size_t rc = iconv(cd, &in_cursor, &in_left, &out_cursor, &out_left);
+		if (rc == (size_t) -1) {
+			// Read errno before building the message, which may allocate and clobber it.
+			const int error = errno;
+			// E2BIG only means the chunk is full: flush it below and resume. It must not
+			// be mistaken for a lossy conversion just because rc is non-zero.
+			if (error != E2BIG)
+				return {ot::st::badly_encoded,
+				        "Could not convert string '" + in + "': " + strerror(error)};
+		} else if (rc != 0) {
+			// A positive count is the number of non-reversible conversions performed.
+			lost_information = true;
+		}
+		out.append(chunk, out_cursor - chunk);
+		if (in_cursor == nullptr)
+			break;
+		else if (in_left == 0)
+			// Input exhausted: loop once more with a null input to flush the shift state.
+			in_cursor = nullptr;
+	}
+	if (lost_information)
+		return {ot::st::information_lost,
+		        "Some characters could not be converted into the target encoding "
+		        "in string '" + in + "'."};
+	return ot::st::ok;
+}
diff --git a/t/system.cc b/t/system.cc
index c1e604b..204b93e 100644
--- a/t/system.cc
+++ b/t/system.cc
@@ -36,9 +36,42 @@ void check_partial_files()
 		throw failure("could not remove the result file");
 }
 
+/**
+ * Exercise ot::encoding_converter: a lossless round trip between ISO-8859-1 and UTF-8, the
+ * rejection of invalid UTF-8, and a lossy transliteration towards ISO-8859-1.
+ */
+void check_converter()
+{
+	const char* latin1_sample = "\xc9\x70\x68\xe9\x6d\xe8\x72\x65";
+	std::string converted;
+
+	// Every ISO-8859-1 character exists in UTF-8, so this direction must be lossless.
+	ot::encoding_converter latin1_to_utf8("ISO-8859-1", "UTF-8");
+	ot::status result = latin1_to_utf8(latin1_sample, converted);
+	if (result != ot::st::ok || converted != "Éphémère")
+		throw failure("conversion to UTF-8 should have worked");
+
+	// The reverse direction, allowing transliteration of unrepresentable characters.
+	ot::encoding_converter utf8_to_latin1("UTF-8", "ISO-8859-1//TRANSLIT");
+	result = utf8_to_latin1("Éphémère", converted);
+	if (result != ot::st::ok || converted != latin1_sample)
+		throw failure("conversion from UTF-8 should have worked");
+
+	// The byte 0xFF is never valid in UTF-8.
+	result = utf8_to_latin1("\xFF\xFF", converted);
+	if (result != ot::st::badly_encoded)
+		throw failure("conversion from bad UTF-8 should have failed");
+
+	// 猫 has no ISO-8859-1 counterpart; the test expects it to come out as '?'.
+	result = utf8_to_latin1("cat 猫 chat", converted);
+	if (result != ot::st::information_lost || converted != "cat ? chat")
+		throw failure("lossy conversion from UTF-8 should have worked");
+}
+
 int main(int argc, char **argv)
 {
-	std::cout << "1..1\n";
+	std::cout << "1..2\n";
 	run(check_partial_files, "test partial files");
+	run(check_converter, "test encoding converter");
 	return 0;
 }