Rework the encoding converter to support std::u8string

This commit is contained in:
Frédéric Mangano 2023-03-03 11:22:18 +09:00
parent befae72d2a
commit 89dc000927
4 changed files with 68 additions and 37 deletions

View File

@ -63,7 +63,6 @@ static struct option getopt_options[] = {
ot::options ot::parse_options(int argc, char** argv, FILE* comments_input)
{
options opt;
static ot::encoding_converter to_utf8("", "UTF-8");
const char* equal;
ot::status rc;
bool set_all = false;
@ -230,7 +229,6 @@ static std::string format_value(const std::string& source)
*/
void ot::print_comments(const std::list<std::string>& comments, FILE* output, bool raw)
{
static ot::encoding_converter from_utf8("UTF-8", "");
std::string local;
bool has_control = false;
for (const std::string& source_comment : comments) {
@ -268,7 +266,6 @@ void ot::print_comments(const std::list<std::string>& comments, FILE* output, bo
std::list<std::string> ot::read_comments(FILE* input, bool raw)
{
std::list<std::string> comments;
static ot::encoding_converter to_utf8("", "UTF-8");
comments.clear();
char* source_line = nullptr;
size_t buflen = 0;

View File

@ -160,25 +160,13 @@ private:
/** Read a whole file into memory and return the read content. */
byte_string slurp_binary_file(const char* filename);
/**
 * C++ wrapper for iconv.
 *
 * An instance owns a single iconv conversion descriptor, opened by the constructor and closed by
 * the destructor. Copying is disabled because two copies would both close the same descriptor.
 */
class encoding_converter {
public:
    /**
     * Allocate the iconv conversion state, initializing the given source and destination
     * character encodings. If it's okay to have some information lost, make sure `to` ends with
     * "//TRANSLIT", otherwise the conversion will fail when a character cannot be represented
     * in the target encoding. See the documentation of iconv_open for details.
     *
     * \throws std::bad_alloc if iconv_open fails.
     */
    encoding_converter(const char* from, const char* to);
    ~encoding_converter();
    encoding_converter(const encoding_converter&) = delete;
    encoding_converter& operator=(const encoding_converter&) = delete;
    /**
     * Convert text using iconv and return the converted string.
     *
     * \throws ot::status with code st::badly_encoded if the input sequence is invalid or if
     *         some characters could not be converted into the target encoding.
     */
    std::string operator()(std::string_view in);
private:
    iconv_t cd; /**< conversion descriptor */
};
/**
 * Convert a string from the system locale's encoding to UTF-8.
 *
 * The conversion reports failure by throwing an ot::status whose code is st::badly_encoded.
 */
std::u8string encode_utf8(std::string_view);
std::string to_utf8(std::string_view); ///< \deprecated Same conversion as encode_utf8, but the UTF-8 result is stored in a plain std::string.
/**
 * Convert a string from UTF-8 to the system locale's encoding.
 *
 * The conversion reports failure by throwing an ot::status whose code is st::badly_encoded.
 */
std::string decode_utf8(std::u8string_view);
std::string from_utf8(std::string_view); ///< \deprecated Same conversion as decode_utf8, but the UTF-8 input is taken as a plain std::string_view.
/** Escape a string so that a POSIX shell interprets it as a single argument. */
std::string shell_escape(std::string_view word);

View File

@ -143,24 +143,47 @@ ot::byte_string ot::slurp_binary_file(const char* filename)
return content;
}
ot::encoding_converter::encoding_converter(const char* from, const char* to)
/**
 * C++ wrapper for iconv.
 *
 * An instance owns a single iconv conversion descriptor, opened by the constructor and closed by
 * the destructor. Copying is disabled because two copies would both close the same descriptor.
 */
class encoding_converter {
public:
    /**
     * Allocate the iconv conversion state, initializing the given source and destination
     * character encodings. If it's okay to have some information lost, make sure `to` ends with
     * "//TRANSLIT", otherwise the conversion will fail when a character cannot be represented
     * in the target encoding. See the documentation of iconv_open for details.
     *
     * \throws std::bad_alloc if iconv_open fails.
     */
    encoding_converter(const char* from, const char* to);
    ~encoding_converter();
    encoding_converter(const encoding_converter&) = delete;
    encoding_converter& operator=(const encoding_converter&) = delete;
    /**
     * Convert text using iconv and return the converted string.
     *
     * InChar and OutChar select the character types of the input view and of the returned
     * string; the bytes are handed to iconv as plain char data.
     *
     * \throws ot::status with code st::badly_encoded if the input sequence is invalid or if
     *         some characters could not be converted into the target encoding.
     */
    template<class InChar, class OutChar>
    std::basic_string<OutChar> convert(std::basic_string_view<InChar>);
private:
    iconv_t cd; /**< conversion descriptor */
};
/**
 * Open the iconv descriptor converting from encoding `from` to encoding `to`.
 * Note that iconv_open takes the destination encoding first.
 * Throws std::bad_alloc when iconv_open reports failure.
 */
encoding_converter::encoding_converter(const char* from, const char* to)
    : cd(iconv_open(to, from))
{
    // iconv_open signals failure by returning (iconv_t) -1.
    const iconv_t invalid_descriptor = (iconv_t) -1;
    if (cd != invalid_descriptor)
        return;
    throw std::bad_alloc();
}
ot::encoding_converter::~encoding_converter()
/** Close the iconv conversion descriptor held by this converter. */
encoding_converter::~encoding_converter()
{
    // The result of iconv_close is discarded.
    static_cast<void>(iconv_close(this->cd));
}
std::string ot::encoding_converter::operator()(std::string_view in)
template<class InChar, class OutChar>
std::basic_string<OutChar> encoding_converter::convert(std::basic_string_view<InChar> in)
{
iconv(cd, nullptr, nullptr, nullptr, nullptr);
std::string out;
std::basic_string<OutChar> out;
out.reserve(in.size());
char* in_cursor = const_cast<char*>(in.data());
const char* in_data = reinterpret_cast<const char*>(in.data());
char* in_cursor = const_cast<char*>(in_data);
size_t in_left = in.size();
constexpr size_t chunk_size = 1024;
char chunk[chunk_size];
@ -172,13 +195,13 @@ std::string ot::encoding_converter::operator()(std::string_view in)
if (rc == (size_t) -1 && errno == E2BIG) {
// Loop normally.
} else if (rc == (size_t) -1) {
throw status {ot::st::badly_encoded, strerror(errno) + "."s};
throw ot::status {ot::st::badly_encoded, strerror(errno) + "."s};
} else if (rc != 0) {
throw status {ot::st::badly_encoded,
"Some characters could not be converted into the target encoding."};
throw ot::status {ot::st::badly_encoded,
"Some characters could not be converted into the target encoding."};
}
out.append(chunk, out_cursor - chunk);
out.append(reinterpret_cast<OutChar*>(chunk), out_cursor - chunk);
if (in_cursor == nullptr)
break;
else if (in_left == 0)
@ -187,6 +210,29 @@ std::string ot::encoding_converter::operator()(std::string_view in)
return out;
}
static encoding_converter to_utf8_cvt("", "UTF-8");
static encoding_converter from_utf8_cvt("UTF-8", "");
std::u8string ot::encode_utf8(std::string_view in)
{
return to_utf8_cvt.convert<char, char8_t>(in);
}
std::string ot::to_utf8(std::string_view in)
{
return to_utf8_cvt.convert<char, char>(in);
}
std::string ot::decode_utf8(std::u8string_view in)
{
return from_utf8_cvt.convert<char8_t, char>(in);
}
std::string ot::from_utf8(std::string_view in)
{
return from_utf8_cvt.convert<char, char>(in);
}
std::string ot::shell_escape(std::string_view word)
{
std::string escaped_word;

View File

@ -48,15 +48,15 @@ void check_slurp()
/**
 * Check the text encoding conversions: explicit converters between ISO-8859-1 and UTF-8, the
 * round trips of the ot:: UTF-8 helpers, and the rejection of malformed UTF-8 input.
 */
void check_converter()
{
// "Éphémère" encoded in ISO-8859-1, one byte per character.
const char* ephemere_iso = "\xc9\x70\x68\xe9\x6d\xe8\x72\x65";
ot::encoding_converter to_utf8("ISO_8859-1", "UTF-8");
ot::encoding_converter from_utf8("UTF-8", "ISO_8859-1");
// Converting in one direction and then back must give the original text.
is(ot::from_utf8(ot::to_utf8("Éphémère")), "Éphémère", "from_utf8 reverts to_utf8");
is(ot::to_utf8(ot::from_utf8("Éphémère")), "Éphémère", "to_utf8 reverts from_utf8");
is(to_utf8(ephemere_iso), "Éphémère", "conversion to UTF-8 is correct");
is(from_utf8("Éphémère"), ephemere_iso, "conversion from UTF-8 is correct");
is(ot::decode_utf8(ot::encode_utf8("Éphémère")), "Éphémère", "decode_utf8 reverts encode_utf8");
opaque_is(ot::encode_utf8(ot::decode_utf8(u8"Éphémère")), u8"Éphémère",
"encode_utf8 reverts decode_utf8");
// The byte 0xFF never appears in well-formed UTF-8, so the conversion is expected to throw
// an ot::status; reaching the throw below means it wrongly succeeded.
try {
from_utf8("\xFF\xFF");
ot::from_utf8("\xFF\xFF");
throw failure("conversion from bad UTF-8 did not fail");
} catch (const ot::status&) {}
}