From 89dc0009278b4fdfb115742094e469057b78154e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Mangano?= Date: Fri, 3 Mar 2023 11:22:18 +0900 Subject: [PATCH] Rework the encoding converter to support std::u8string --- src/cli.cc | 3 --- src/opustags.h | 26 ++++++-------------- src/system.cc | 64 +++++++++++++++++++++++++++++++++++++++++++------- t/system.cc | 12 +++++----- 4 files changed, 68 insertions(+), 37 deletions(-) diff --git a/src/cli.cc b/src/cli.cc index bc9a3d5..1f3bb4a 100644 --- a/src/cli.cc +++ b/src/cli.cc @@ -63,7 +63,6 @@ static struct option getopt_options[] = { ot::options ot::parse_options(int argc, char** argv, FILE* comments_input) { options opt; - static ot::encoding_converter to_utf8("", "UTF-8"); const char* equal; ot::status rc; bool set_all = false; @@ -230,7 +229,6 @@ static std::string format_value(const std::string& source) */ void ot::print_comments(const std::list& comments, FILE* output, bool raw) { - static ot::encoding_converter from_utf8("UTF-8", ""); std::string local; bool has_control = false; for (const std::string& source_comment : comments) { @@ -268,7 +266,6 @@ void ot::print_comments(const std::list& comments, FILE* output, bo std::list ot::read_comments(FILE* input, bool raw) { std::list comments; - static ot::encoding_converter to_utf8("", "UTF-8"); comments.clear(); char* source_line = nullptr; size_t buflen = 0; diff --git a/src/opustags.h b/src/opustags.h index 36614b0..cfb80d8 100644 --- a/src/opustags.h +++ b/src/opustags.h @@ -160,25 +160,13 @@ private: /** Read a whole file into memory and return the read content. */ byte_string slurp_binary_file(const char* filename); -/** C++ wrapper for iconv. */ -class encoding_converter { -public: - /** - * Allocate the iconv conversion state, initializing the given source and destination - * character encodings. 
If it's okay to have some information lost, make sure `to` ends with - * "//TRANSLIT", otherwise the conversion will fail when a character cannot be represented - * in the target encoding. See the documentation of iconv_open for details. - */ - encoding_converter(const char* from, const char* to); - ~encoding_converter(); - /** - * Convert text using iconv. If the input sequence is invalid, return #st::badly_encoded and - * abort the processing, leaving out in an undefined state. - */ - std::string operator()(std::string_view in); -private: - iconv_t cd; /**< conversion descriptor */ -}; +/** Convert a string from the system locale’s encoding to UTF-8. Throws ot::status on failure. */ +std::u8string encode_utf8(std::string_view); +std::string to_utf8(std::string_view); ///< \deprecated + +/** Convert a string from UTF-8 to the system locale’s encoding. Throws ot::status on failure. */ +std::string decode_utf8(std::u8string_view); +std::string from_utf8(std::string_view); ///< \deprecated /** Escape a string so that a POSIX shell interprets it as a single argument. */ std::string shell_escape(std::string_view word); diff --git a/src/system.cc b/src/system.cc index 6d26703..431e040 100644 --- a/src/system.cc +++ b/src/system.cc @@ -143,24 +143,47 @@ ot::byte_string ot::slurp_binary_file(const char* filename) return content; } -ot::encoding_converter::encoding_converter(const char* from, const char* to) +/** C++ wrapper for iconv. */ +class encoding_converter { +public: + /** + * Allocate the iconv conversion state, initializing the given source and destination + * character encodings. If it's okay to have some information lost, make sure `to` ends with + * "//TRANSLIT", otherwise the conversion will fail when a character cannot be represented + * in the target encoding. See the documentation of iconv_open for details. + */ + encoding_converter(const char* from, const char* to); + ~encoding_converter(); + /** + * Convert text using iconv. 
If the input sequence is invalid or cannot be fully converted, throw an + * ot::status carrying #st::badly_encoded; no converted text is returned in that case. + */ + template + std::basic_string convert(std::basic_string_view); +private: + iconv_t cd; /**< conversion descriptor */ +}; + +encoding_converter::encoding_converter(const char* from, const char* to) { cd = iconv_open(to, from); if (cd == (iconv_t) -1) throw std::bad_alloc(); } -ot::encoding_converter::~encoding_converter() +encoding_converter::~encoding_converter() { iconv_close(cd); } -std::string ot::encoding_converter::operator()(std::string_view in) +template +std::basic_string encoding_converter::convert(std::basic_string_view in) { iconv(cd, nullptr, nullptr, nullptr, nullptr); - std::string out; + std::basic_string out; out.reserve(in.size()); - char* in_cursor = const_cast(in.data()); + const char* in_data = reinterpret_cast(in.data()); + char* in_cursor = const_cast(in_data); size_t in_left = in.size(); constexpr size_t chunk_size = 1024; char chunk[chunk_size]; @@ -172,13 +195,13 @@ std::string ot::encoding_converter::operator()(std::string_view in) if (rc == (size_t) -1 && errno == E2BIG) { // Loop normally. 
} else if (rc == (size_t) -1) { - throw status {ot::st::badly_encoded, strerror(errno) + "."s}; + throw ot::status {ot::st::badly_encoded, strerror(errno) + "."s}; } else if (rc != 0) { - throw status {ot::st::badly_encoded, - "Some characters could not be converted into the target encoding."}; + throw ot::status {ot::st::badly_encoded, + "Some characters could not be converted into the target encoding."}; } - out.append(chunk, out_cursor - chunk); + out.append(reinterpret_cast(chunk), out_cursor - chunk); if (in_cursor == nullptr) break; else if (in_left == 0) @@ -187,6 +210,29 @@ std::string ot::encoding_converter::operator()(std::string_view in) return out; } +/* Built on first use (not before main) so iconv_open errors are catchable and "" presumably sees the active locale (confirm). */ +static encoding_converter& to_utf8_cvt() { static encoding_converter cvt("", "UTF-8"); return cvt; } +static encoding_converter& from_utf8_cvt() { static encoding_converter cvt("UTF-8", ""); return cvt; } +std::u8string ot::encode_utf8(std::string_view in) +{ + return to_utf8_cvt().convert(in); +} + +std::string ot::to_utf8(std::string_view in) +{ + return to_utf8_cvt().convert(in); +} + +std::string ot::decode_utf8(std::u8string_view in) +{ + return from_utf8_cvt().convert(in); +} + +std::string ot::from_utf8(std::string_view in) +{ + return from_utf8_cvt().convert(in); +} + std::string ot::shell_escape(std::string_view word) { std::string escaped_word; diff --git a/t/system.cc b/t/system.cc index f0b0e0a..c8deae7 100644 --- a/t/system.cc +++ b/t/system.cc @@ -48,15 +48,15 @@ void check_slurp() void check_converter() { - const char* ephemere_iso = "\xc9\x70\x68\xe9\x6d\xe8\x72\x65"; - ot::encoding_converter to_utf8("ISO_8859-1", "UTF-8"); - ot::encoding_converter from_utf8("UTF-8", "ISO_8859-1"); + is(ot::from_utf8(ot::to_utf8("Éphémère")), "Éphémère", "from_utf8 reverts to_utf8"); + is(ot::to_utf8(ot::from_utf8("Éphémère")), "Éphémère", "to_utf8 reverts from_utf8"); - is(to_utf8(ephemere_iso), "Éphémère", "conversion to UTF-8 is correct"); - is(from_utf8("Éphémère"), ephemere_iso, "conversion from UTF-8 is correct"); + is(ot::decode_utf8(ot::encode_utf8("Éphémère")), "Éphémère", "decode_utf8 reverts 
encode_utf8"); + opaque_is(ot::encode_utf8(ot::decode_utf8(u8"Éphémère")), u8"Éphémère", + "encode_utf8 reverts decode_utf8"); try { - from_utf8("\xFF\xFF"); + ot::from_utf8("\xFF\xFF"); throw failure("conversion from bad UTF-8 did not fail"); } catch (const ot::status&) {} }