character encoding converter

This commit is contained in:
Frédéric Mangano-Tarumi 2018-12-09 11:45:00 -05:00
parent ca06c6fb9d
commit ebc8347c9e
3 changed files with 92 additions and 1 deletions

View File

@ -23,6 +23,7 @@
#pragma once
#include <iconv.h>
#include <ogg/ogg.h>
#include <stdio.h>
@ -55,6 +56,9 @@ enum class st {
error,
standard_error, /**< Error raised by the C standard library. */
int_overflow,
/* System */
badly_encoded,
information_lost,
/* Ogg */
bad_stream,
end_of_stream,
@ -133,6 +137,27 @@ private:
ot::file file;
};
/**
 * C++ wrapper for iconv.
 *
 * An instance holds one iconv conversion descriptor, opened by the constructor and closed by
 * the destructor. Because the descriptor would otherwise be closed once per copy, instances
 * cannot be copied.
 */
class encoding_converter {
public:
	/**
	 * Allocate the iconv conversion state, initializing the given source and destination
	 * character encodings. If it's okay to have some information lost, make sure `to` ends with
	 * "//TRANSLIT", otherwise the conversion will fail when a character cannot be represented
	 * in the target encoding. See the documentation of iconv_open for details.
	 */
	encoding_converter(const char* from, const char* to);
	/** Copying is disabled: two objects sharing `cd` would both call iconv_close on it. */
	encoding_converter(const encoding_converter&) = delete;
	encoding_converter& operator=(const encoding_converter&) = delete;
	~encoding_converter();
	/**
	 * Convert text using iconv. If the input sequence is invalid, return #st::badly_encoded and
	 * abort the processing. If some character could not be converted perfectly, keep converting
	 * the string and finally return #st::information_lost.
	 *
	 * \param in  Text encoded in the source encoding.
	 * \param out Receives the converted text; its previous content is discarded.
	 */
	status operator()(const std::string& in, std::string& out);
private:
	iconv_t cd; /**< conversion descriptor */
};
/** \} */
/***********************************************************************************************//**

View File

@ -50,3 +50,47 @@ void ot::partial_file::abort()
file.reset();
remove(temporary_name.c_str());
}
/**
 * Open the iconv conversion descriptor converting from encoding `from` to encoding `to`.
 * Note that iconv_open takes the target encoding first, hence the swapped arguments.
 * When the descriptor cannot be opened, std::bad_alloc is thrown.
 */
ot::encoding_converter::encoding_converter(const char* from, const char* to)
	: cd(iconv_open(to, from))
{
	// iconv_open signals failure with the special value (iconv_t) -1.
	const iconv_t invalid_descriptor = (iconv_t) -1;
	if (cd == invalid_descriptor) {
		throw std::bad_alloc();
	}
}
/** Close the conversion descriptor held in `cd`. */
ot::encoding_converter::~encoding_converter()
{
	// The result of iconv_close is not checked.
	static_cast<void>(iconv_close(cd));
}
/**
 * Convert `in` with iconv and store the result in `out`, whose previous content is discarded.
 *
 * The input is converted through a fixed-size stack buffer, one chunk at a time, so the output
 * may be arbitrarily larger than the input.
 *
 * \return #st::badly_encoded as soon as iconv reports an error other than a full output
 *         buffer; `out` then holds whatever was converted before the error.
 *         #st::information_lost when the whole string was converted but iconv reported
 *         non-reversible conversions. #st::ok otherwise.
 */
ot::status ot::encoding_converter::operator()(const std::string& in, std::string& out)
{
	// Put the descriptor back into its initial state before starting a new string.
	iconv(cd, nullptr, nullptr, nullptr, nullptr);
	out.clear();
	out.reserve(in.size());
	// iconv's prototype wants a non-const input pointer.
	char* in_cursor = const_cast<char*>(in.data());
	size_t in_left = in.size();
	constexpr size_t chunk_size = 1024;
	char chunk[chunk_size];
	bool lost_information = false;
	for (;;) {
		char* out_cursor = chunk;
		size_t out_left = chunk_size;
		const size_t rc = iconv(cd, &in_cursor, &in_left, &out_cursor, &out_left);
		// Save errno immediately: building the error message below may allocate and
		// overwrite it before strerror gets to read it.
		const int error = errno;
		if (rc == (size_t) -1) {
			// E2BIG only means the chunk is full; flush it and loop normally. It must not be
			// mistaken for a lossy conversion. Anything else is a real failure.
			if (error != E2BIG)
				return {ot::st::badly_encoded,
				        "Could not convert string '" + in + "': " + strerror(error)};
		} else if (rc != 0) {
			// On success, iconv returns the number of non-reversible conversions performed.
			lost_information = true;
		}
		out.append(chunk, out_cursor - chunk);
		if (in_cursor == nullptr)
			break; // The final flush below has just been performed.
		else if (in_left == 0)
			in_cursor = nullptr; // Input exhausted: call iconv once more with a null input.
	}
	if (lost_information)
		return {ot::st::information_lost,
		        "Some characters could not be converted into the target encoding "
		        "in string '" + in + "'."};
	return ot::st::ok;
}

View File

@ -36,9 +36,31 @@ void check_partial_files()
throw failure("could not remove the result file");
}
void check_converter()
{
const char* ephemere_iso = "\xc9\x70\x68\xe9\x6d\xe8\x72\x65";
ot::encoding_converter to_utf8("ISO-8859-1", "UTF-8");
std::string out;
ot::status rc = to_utf8(ephemere_iso, out);
if (rc != ot::st::ok || out != "Éphémère")
throw failure("conversion to UTF-8 should have worked");
ot::encoding_converter from_utf8("UTF-8", "ISO-8859-1//TRANSLIT");
rc = from_utf8("Éphémère", out);
if (rc != ot::st::ok || out != ephemere_iso)
throw failure("conversion from UTF-8 should have worked");
rc = from_utf8("\xFF\xFF", out);
if (rc != ot::st::badly_encoded)
throw failure("conversion from bad UTF-8 should have failed");
rc = from_utf8("cat 猫 chat", out);
if (rc != ot::st::information_lost || out != "cat ? chat")
throw failure("lossy conversion from UTF-8 should have worked");
}
/**
 * Entry point of the test program: print the test plan, then run every check through run().
 * The command-line arguments are not used.
 */
int main(int argc, char **argv)
{
	// Print a single plan line (TAP-style `1..N`), where N is the number of run() calls below.
	std::cout << "1..2\n";
	run(check_partial_files, "test partial files");
	run(check_converter, "test encoding converter");
	return 0;
}