mirror of
https://github.com/fmang/opustags.git
synced 2025-01-15 20:53:16 +01:00
character encoding converter
This commit is contained in:
parent
ca06c6fb9d
commit
ebc8347c9e
@ -23,6 +23,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <iconv.h>
|
||||
#include <ogg/ogg.h>
|
||||
#include <stdio.h>
|
||||
|
||||
@ -55,6 +56,9 @@ enum class st {
|
||||
error,
|
||||
standard_error, /**< Error raised by the C standard library. */
|
||||
int_overflow,
|
||||
/* System */
|
||||
badly_encoded,
|
||||
information_lost,
|
||||
/* Ogg */
|
||||
bad_stream,
|
||||
end_of_stream,
|
||||
@ -133,6 +137,27 @@ private:
|
||||
ot::file file;
|
||||
};
|
||||
|
||||
/** C++ wrapper for iconv. */
|
||||
class encoding_converter {
|
||||
public:
|
||||
/**
|
||||
* Allocate the iconv conversion state, initializing the given source and destination
|
||||
* character encodings. If it's okay to have some information lost, make sure `to` ends with
|
||||
* "//TRANSLIT", otherwise the conversion will fail when a character cannot be represented
|
||||
* in the target encoding. See the documentation of iconv_open for details.
|
||||
*/
|
||||
encoding_converter(const char* from, const char* to);
|
||||
~encoding_converter();
|
||||
/**
|
||||
* Convert text using iconv. If the input sequence is invalid, return #st::badly_encoded and
|
||||
* abort the processing. If some character could not be converted perfectly, keep converting
|
||||
* the string and finally return #st::information_lost.
|
||||
*/
|
||||
status operator()(const std::string& in, std::string& out);
|
||||
private:
|
||||
iconv_t cd; /**< conversion descriptor */
|
||||
};
|
||||
|
||||
/** \} */
|
||||
|
||||
/***********************************************************************************************//**
|
||||
|
@ -50,3 +50,47 @@ void ot::partial_file::abort()
|
||||
file.reset();
|
||||
remove(temporary_name.c_str());
|
||||
}
|
||||
|
||||
/**
 * Open the iconv conversion descriptor converting text from the encoding `from` to the encoding
 * `to`. Note that iconv_open expects the destination encoding first.
 *
 * Throws std::bad_alloc when iconv_open fails to provide a descriptor.
 */
ot::encoding_converter::encoding_converter(const char* from, const char* to)
	: cd(iconv_open(to, from))
{
	// iconv_open signals failure with the sentinel value (iconv_t) -1.
	const iconv_t invalid_descriptor = (iconv_t) -1;
	if (cd == invalid_descriptor)
		throw std::bad_alloc();
}
|
||||
|
||||
/** Close the conversion descriptor held in `cd`. */
ot::encoding_converter::~encoding_converter()
{
	// The result of iconv_close is deliberately discarded.
	static_cast<void>(iconv_close(cd));
}
|
||||
|
||||
/**
 * Convert `in` with iconv and store the result in `out`, whose previous content is discarded.
 *
 * The output is produced through a fixed-size stack buffer that is appended to `out` after every
 * iconv call, so the converted text may be arbitrarily longer than the buffer.
 *
 * \return #st::badly_encoded as soon as iconv fails for any reason other than a full output
 *         buffer, #st::information_lost if iconv reported a non-zero count of conversions once
 *         the whole string was processed, and #st::ok otherwise.
 */
ot::status ot::encoding_converter::operator()(const std::string& in, std::string& out)
{
	// Reset the descriptor to its initial state before starting a new string.
	iconv(cd, nullptr, nullptr, nullptr, nullptr);
	out.clear();
	out.reserve(in.size());
	// iconv takes a pointer to non-const char, hence the cast.
	char* in_cursor = const_cast<char*>(in.data());
	size_t in_left = in.size();
	constexpr size_t chunk_size = 1024;
	char chunk[chunk_size];
	bool lost_information = false;
	for (;;) {
		char* out_cursor = chunk;
		size_t out_left = chunk_size;
		const size_t rc = iconv(cd, &in_cursor, &in_left, &out_cursor, &out_left);
		if (rc == (size_t) -1) {
			// Save errno right away, before building the message can disturb it.
			const int error = errno;
			if (error != E2BIG)
				return {ot::st::badly_encoded,
				        "Could not convert string '" + in + "': " + strerror(error)};
			// E2BIG only means the chunk is full: flush it below and keep going. It must not
			// be mistaken for a lossy conversion.
		} else if (rc != 0) {
			// A positive return value counts the characters converted in a non-reversible way.
			lost_information = true;
		}
		out.append(chunk, out_cursor - chunk);
		if (in_cursor == nullptr)
			break; // The final flushing call has been made.
		else if (in_left == 0)
			in_cursor = nullptr; // Input exhausted: make one last call with a null input.
	}
	if (lost_information)
		return {ot::st::information_lost,
		        "Some characters could not be converted into the target encoding "
		        "in string '" + in + "'."};
	return ot::st::ok;
}
|
||||
|
24
t/system.cc
24
t/system.cc
@ -36,9 +36,31 @@ void check_partial_files()
|
||||
throw failure("could not remove the result file");
|
||||
}
|
||||
|
||||
void check_converter()
|
||||
{
|
||||
const char* ephemere_iso = "\xc9\x70\x68\xe9\x6d\xe8\x72\x65";
|
||||
ot::encoding_converter to_utf8("ISO-8859-1", "UTF-8");
|
||||
std::string out;
|
||||
ot::status rc = to_utf8(ephemere_iso, out);
|
||||
if (rc != ot::st::ok || out != "Éphémère")
|
||||
throw failure("conversion to UTF-8 should have worked");
|
||||
|
||||
ot::encoding_converter from_utf8("UTF-8", "ISO-8859-1//TRANSLIT");
|
||||
rc = from_utf8("Éphémère", out);
|
||||
if (rc != ot::st::ok || out != ephemere_iso)
|
||||
throw failure("conversion from UTF-8 should have worked");
|
||||
rc = from_utf8("\xFF\xFF", out);
|
||||
if (rc != ot::st::badly_encoded)
|
||||
throw failure("conversion from bad UTF-8 should have failed");
|
||||
rc = from_utf8("cat 猫 chat", out);
|
||||
if (rc != ot::st::information_lost || out != "cat ? chat")
|
||||
throw failure("lossy conversion from UTF-8 should have worked");
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
std::cout << "1..1\n";
|
||||
std::cout << "1..2\n";
|
||||
run(check_partial_files, "test partial files");
|
||||
run(check_converter, "test encoding converter");
|
||||
return 0;
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user