mirror of
https://github.com/fmang/opustags.git
synced 2025-01-15 12:43:17 +01:00
Rework the encoding converter to support std::u8string
This commit is contained in:
parent
befae72d2a
commit
89dc000927
@ -63,7 +63,6 @@ static struct option getopt_options[] = {
|
||||
ot::options ot::parse_options(int argc, char** argv, FILE* comments_input)
|
||||
{
|
||||
options opt;
|
||||
static ot::encoding_converter to_utf8("", "UTF-8");
|
||||
const char* equal;
|
||||
ot::status rc;
|
||||
bool set_all = false;
|
||||
@ -230,7 +229,6 @@ static std::string format_value(const std::string& source)
|
||||
*/
|
||||
void ot::print_comments(const std::list<std::string>& comments, FILE* output, bool raw)
|
||||
{
|
||||
static ot::encoding_converter from_utf8("UTF-8", "");
|
||||
std::string local;
|
||||
bool has_control = false;
|
||||
for (const std::string& source_comment : comments) {
|
||||
@ -268,7 +266,6 @@ void ot::print_comments(const std::list<std::string>& comments, FILE* output, bo
|
||||
std::list<std::string> ot::read_comments(FILE* input, bool raw)
|
||||
{
|
||||
std::list<std::string> comments;
|
||||
static ot::encoding_converter to_utf8("", "UTF-8");
|
||||
comments.clear();
|
||||
char* source_line = nullptr;
|
||||
size_t buflen = 0;
|
||||
|
@ -160,25 +160,13 @@ private:
|
||||
/** Read a whole file into memory and return the read content. */
|
||||
byte_string slurp_binary_file(const char* filename);
|
||||
|
||||
/**
 * C++ wrapper for iconv.
 *
 * Each instance owns one iconv conversion descriptor, which the destructor closes. Copying is
 * disabled because two copies would otherwise close the same descriptor twice.
 */
class encoding_converter {
public:
	/**
	 * Allocate the iconv conversion state, initializing the given source and destination
	 * character encodings. If it's okay to have some information lost, make sure `to` ends with
	 * "//TRANSLIT", otherwise the conversion will fail when a character cannot be represented
	 * in the target encoding. See the documentation of iconv_open for details.
	 *
	 * Throws std::bad_alloc when iconv_open fails.
	 */
	encoding_converter(const char* from, const char* to);
	~encoding_converter();
	/* The descriptor has a single owner: no copies. */
	encoding_converter(const encoding_converter&) = delete;
	encoding_converter& operator=(const encoding_converter&) = delete;
	/**
	 * Convert text using iconv and return the converted string. If the input sequence is
	 * invalid or cannot be represented in the target encoding, a status with code
	 * st::badly_encoded is thrown and no result is returned.
	 */
	std::string operator()(std::string_view in);
private:
	iconv_t cd; /**< conversion descriptor */
};
|
||||
/** Convert a string from the system locale’s encoding to UTF-8. */
|
||||
std::u8string encode_utf8(std::string_view);
|
||||
std::string to_utf8(std::string_view); ///< \deprecated
|
||||
|
||||
/** Convert a string from UTF-8 to the system locale’s encoding. */
|
||||
std::string decode_utf8(std::u8string_view);
|
||||
std::string from_utf8(std::string_view); ///< \deprecated
|
||||
|
||||
/** Escape a string so that a POSIX shell interprets it as a single argument. */
|
||||
std::string shell_escape(std::string_view word);
|
||||
|
@ -143,24 +143,47 @@ ot::byte_string ot::slurp_binary_file(const char* filename)
|
||||
return content;
|
||||
}
|
||||
|
||||
ot::encoding_converter::encoding_converter(const char* from, const char* to)
|
||||
/**
 * C++ wrapper for iconv.
 *
 * Each instance owns one iconv conversion descriptor, which the destructor closes. Copying is
 * disabled because two copies would otherwise close the same descriptor twice.
 */
class encoding_converter {
public:
	/**
	 * Allocate the iconv conversion state, initializing the given source and destination
	 * character encodings. If it's okay to have some information lost, make sure `to` ends with
	 * "//TRANSLIT", otherwise the conversion will fail when a character cannot be represented
	 * in the target encoding. See the documentation of iconv_open for details.
	 *
	 * Throws std::bad_alloc when iconv_open fails.
	 */
	encoding_converter(const char* from, const char* to);
	~encoding_converter();
	/* The descriptor has a single owner: no copies. */
	encoding_converter(const encoding_converter&) = delete;
	encoding_converter& operator=(const encoding_converter&) = delete;
	/**
	 * Convert text using iconv and return the converted string. InChar and OutChar select the
	 * character types of the input view and of the returned string; the bytes themselves are
	 * passed to iconv unchanged. If the input sequence is invalid or cannot be represented in
	 * the target encoding, an ot::status with code st::badly_encoded is thrown.
	 */
	template<class InChar, class OutChar>
	std::basic_string<OutChar> convert(std::basic_string_view<InChar>);
private:
	iconv_t cd; /**< conversion descriptor */
};
|
||||
|
||||
/**
 * Open the iconv conversion descriptor translating text from encoding `from` into encoding
 * `to`. Note that iconv_open takes the destination first. Any failure to open the descriptor
 * is reported as std::bad_alloc.
 */
encoding_converter::encoding_converter(const char* from, const char* to)
	: cd(iconv_open(to, from))
{
	// iconv_open signals failure with the sentinel (iconv_t) -1.
	const iconv_t open_failed = (iconv_t) -1;
	if (cd == open_failed) {
		throw std::bad_alloc();
	}
}
|
||||
|
||||
ot::encoding_converter::~encoding_converter()
|
||||
/** Release the conversion descriptor opened by the constructor. */
encoding_converter::~encoding_converter()
{
	// The result of iconv_close is deliberately ignored.
	static_cast<void>(iconv_close(cd));
}
|
||||
|
||||
std::string ot::encoding_converter::operator()(std::string_view in)
|
||||
template<class InChar, class OutChar>
|
||||
std::basic_string<OutChar> encoding_converter::convert(std::basic_string_view<InChar> in)
|
||||
{
|
||||
iconv(cd, nullptr, nullptr, nullptr, nullptr);
|
||||
std::string out;
|
||||
std::basic_string<OutChar> out;
|
||||
out.reserve(in.size());
|
||||
char* in_cursor = const_cast<char*>(in.data());
|
||||
const char* in_data = reinterpret_cast<const char*>(in.data());
|
||||
char* in_cursor = const_cast<char*>(in_data);
|
||||
size_t in_left = in.size();
|
||||
constexpr size_t chunk_size = 1024;
|
||||
char chunk[chunk_size];
|
||||
@ -172,13 +195,13 @@ std::string ot::encoding_converter::operator()(std::string_view in)
|
||||
if (rc == (size_t) -1 && errno == E2BIG) {
|
||||
// Loop normally.
|
||||
} else if (rc == (size_t) -1) {
|
||||
throw status {ot::st::badly_encoded, strerror(errno) + "."s};
|
||||
throw ot::status {ot::st::badly_encoded, strerror(errno) + "."s};
|
||||
} else if (rc != 0) {
|
||||
throw status {ot::st::badly_encoded,
|
||||
"Some characters could not be converted into the target encoding."};
|
||||
throw ot::status {ot::st::badly_encoded,
|
||||
"Some characters could not be converted into the target encoding."};
|
||||
}
|
||||
|
||||
out.append(chunk, out_cursor - chunk);
|
||||
out.append(reinterpret_cast<OutChar*>(chunk), out_cursor - chunk);
|
||||
if (in_cursor == nullptr)
|
||||
break;
|
||||
else if (in_left == 0)
|
||||
@ -187,6 +210,29 @@ std::string ot::encoding_converter::operator()(std::string_view in)
|
||||
return out;
|
||||
}
|
||||
|
||||
static encoding_converter to_utf8_cvt("", "UTF-8");
|
||||
static encoding_converter from_utf8_cvt("UTF-8", "");
|
||||
|
||||
std::u8string ot::encode_utf8(std::string_view in)
|
||||
{
|
||||
return to_utf8_cvt.convert<char, char8_t>(in);
|
||||
}
|
||||
|
||||
std::string ot::to_utf8(std::string_view in)
|
||||
{
|
||||
return to_utf8_cvt.convert<char, char>(in);
|
||||
}
|
||||
|
||||
std::string ot::decode_utf8(std::u8string_view in)
|
||||
{
|
||||
return from_utf8_cvt.convert<char8_t, char>(in);
|
||||
}
|
||||
|
||||
std::string ot::from_utf8(std::string_view in)
|
||||
{
|
||||
return from_utf8_cvt.convert<char, char>(in);
|
||||
}
|
||||
|
||||
std::string ot::shell_escape(std::string_view word)
|
||||
{
|
||||
std::string escaped_word;
|
||||
|
12
t/system.cc
12
t/system.cc
@ -48,15 +48,15 @@ void check_slurp()
|
||||
|
||||
// Test of the encoding conversion helpers: a non-ASCII word must survive a round trip through
// the conversions in both directions, and invalid UTF-8 input must raise ot::status.
// NOTE(review): this span interleaves two revisions of the test (explicit ISO-8859-1
// converter objects vs. the ot:: free functions); only one set of lines belongs in the file.
void check_converter()
|
||||
{
|
||||
const char* ephemere_iso = "\xc9\x70\x68\xe9\x6d\xe8\x72\x65";
|
||||
ot::encoding_converter to_utf8("ISO_8859-1", "UTF-8");
|
||||
ot::encoding_converter from_utf8("UTF-8", "ISO_8859-1");
|
||||
is(ot::from_utf8(ot::to_utf8("Éphémère")), "Éphémère", "from_utf8 reverts to_utf8");
|
||||
is(ot::to_utf8(ot::from_utf8("Éphémère")), "Éphémère", "to_utf8 reverts from_utf8");
|
||||
|
||||
is(to_utf8(ephemere_iso), "Éphémère", "conversion to UTF-8 is correct");
|
||||
is(from_utf8("Éphémère"), ephemere_iso, "conversion from UTF-8 is correct");
|
||||
is(ot::decode_utf8(ot::encode_utf8("Éphémère")), "Éphémère", "decode_utf8 reverts encode_utf8");
|
||||
opaque_is(ot::encode_utf8(ot::decode_utf8(u8"Éphémère")), u8"Éphémère",
|
||||
"encode_utf8 reverts decode_utf8");
|
||||
|
||||
// "\xFF\xFF" is not valid UTF-8: the conversion is expected to throw ot::status, so reaching
// the next statement means the test failed.
try {
|
||||
from_utf8("\xFF\xFF");
|
||||
ot::from_utf8("\xFF\xFF");
|
||||
throw failure("conversion from bad UTF-8 did not fail");
|
||||
} catch (const ot::status&) {}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user