From ebc8347c9e18c7f48c1a736b5a8b0f5b0d0b34af Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Mangano-Tarumi?= <fmang@mg0.fr>
Date: Sun, 9 Dec 2018 11:45:00 -0500
Subject: [PATCH] character encoding converter

---
 src/opustags.h | 30 ++++++++++++++++++++++++++++++
 src/system.cc  | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 t/system.cc    | 30 +++++++++++++++++++++++++++++-
 3 files changed, 109 insertions(+), 1 deletion(-)

diff --git a/src/opustags.h b/src/opustags.h
index 37b82c7..2ae5ce0 100644
--- a/src/opustags.h
+++ b/src/opustags.h
@@ -23,6 +23,7 @@
 
 #pragma once
 
+#include <iconv.h>
 #include <ogg/ogg.h>
 #include <stdio.h>
 
@@ -55,6 +56,9 @@ enum class st {
 	error,
 	standard_error, /**< Error raised by the C standard library. */
 	int_overflow,
+	/* System */
+	badly_encoded,
+	information_lost,
 	/* Ogg */
 	bad_stream,
 	end_of_stream,
@@ -133,6 +137,32 @@ private:
 	ot::file file;
 };
 
+/** C++ wrapper for iconv. */
+class encoding_converter {
+public:
+	/**
+	 * Allocate the iconv conversion state, initializing the given source and destination
+	 * character encodings. If it's okay to have some information lost, make sure `to` ends with
+	 * "//TRANSLIT", otherwise the conversion will fail when a character cannot be represented
+	 * in the target encoding. See the documentation of iconv_open for details.
+	 *
+	 * Throws std::bad_alloc when iconv_open fails.
+	 */
+	encoding_converter(const char* from, const char* to);
+	~encoding_converter();
+	/** The conversion descriptor is closed by the destructor, so it must not be shared. */
+	encoding_converter(const encoding_converter&) = delete;
+	encoding_converter& operator=(const encoding_converter&) = delete;
+	/**
+	 * Convert text using iconv. If the input sequence is invalid, return #st::badly_encoded and
+	 * abort the processing. If some character could not be converted perfectly, keep converting
+	 * the string and finally return #st::information_lost.
+	 */
+	status operator()(const std::string& in, std::string& out);
+private:
+	iconv_t cd; /**< conversion descriptor */
+};
+
 /** \} */
 
 /***********************************************************************************************//**
diff --git a/src/system.cc b/src/system.cc
index fb5396a..e62c25b 100644
--- a/src/system.cc
+++ b/src/system.cc
@@ -50,3 +50,53 @@ void ot::partial_file::abort()
 	file.reset();
 	remove(temporary_name.c_str());
 }
+
+ot::encoding_converter::encoding_converter(const char* from, const char* to)
+{
+	cd = iconv_open(to, from);
+	if (cd == (iconv_t) -1)
+		throw std::bad_alloc();
+}
+
+ot::encoding_converter::~encoding_converter()
+{
+	iconv_close(cd);
+}
+
+ot::status ot::encoding_converter::operator()(const std::string& in, std::string& out)
+{
+	/* Reset the conversion state left over by any previous call. */
+	iconv(cd, nullptr, nullptr, nullptr, nullptr);
+	out.clear();
+	out.reserve(in.size());
+	char* in_cursor = const_cast<char*>(in.data());
+	size_t in_left = in.size();
+	constexpr size_t chunk_size = 1024;
+	char chunk[chunk_size];
+	bool lost_information = false;
+	for (;;) {
+		char *out_cursor = chunk;
+		size_t out_left = chunk_size;
+		size_t rc = iconv(cd, &in_cursor, &in_left, &out_cursor, &out_left);
+		if (rc == (size_t) -1) {
+			/* E2BIG only means the chunk is full: flush it and carry on. */
+			if (errno != E2BIG)
+				return {ot::st::badly_encoded,
+				        "Could not convert string '" + in + "': " + strerror(errno)};
+		} else if (rc != 0) {
+			/* iconv returns the number of non-reversible conversions it performed. */
+			lost_information = true;
+		}
+		out.append(chunk, out_cursor - chunk);
+		/* Once the input is exhausted, run one more pass with a null input to flush. */
+		if (in_cursor == nullptr)
+			break;
+		else if (in_left == 0)
+			in_cursor = nullptr;
+	}
+	if (lost_information)
+		return {ot::st::information_lost,
+		        "Some characters could not be converted into the target encoding "
+		        "in string '" + in + "'."};
+	return ot::st::ok;
+}
diff --git a/t/system.cc b/t/system.cc
index c1e604b..204b93e 100644
--- a/t/system.cc
+++ b/t/system.cc
@@ -36,9 +36,37 @@ void check_partial_files()
 		throw failure("could not remove the result file");
 }
 
+void check_converter()
+{
+	const char* ephemere_iso = "\xc9\x70\x68\xe9\x6d\xe8\x72\x65";
+	ot::encoding_converter to_utf8("ISO-8859-1", "UTF-8");
+	std::string out;
+	ot::status rc = to_utf8(ephemere_iso, out);
+	if (rc != ot::st::ok || out != "Éphémère")
+		throw failure("conversion to UTF-8 should have worked");
+
+	/* The output spans several conversion chunks, which must not be reported as a loss. */
+	std::string long_ascii(5000, 'a');
+	rc = to_utf8(long_ascii, out);
+	if (rc != ot::st::ok || out != long_ascii)
+		throw failure("conversion of a long string should have worked");
+
+	ot::encoding_converter from_utf8("UTF-8", "ISO-8859-1//TRANSLIT");
+	rc = from_utf8("Éphémère", out);
+	if (rc != ot::st::ok || out != ephemere_iso)
+		throw failure("conversion from UTF-8 should have worked");
+	rc = from_utf8("\xFF\xFF", out);
+	if (rc != ot::st::badly_encoded)
+		throw failure("conversion from bad UTF-8 should have failed");
+	rc = from_utf8("cat 猫 chat", out);
+	if (rc != ot::st::information_lost || out != "cat ? chat")
+		throw failure("lossy conversion from UTF-8 should have worked");
+}
+
 int main(int argc, char **argv)
 {
-	std::cout << "1..1\n";
+	std::cout << "1..2\n";
 	run(check_partial_files, "test partial files");
+	run(check_converter, "test encoding converter");
 	return 0;
 }