Use //IGNORE instead of //TRANSLIT when transcoding

//TRANSLIT is not well supported, and in most cases there’s not much
transliteration can help with when the encoding is limiting. Besides,
it sounds reasonable to assume most people use UTF-8 nowadays.
This commit is contained in:
Frédéric Mangano 2020-12-26 11:53:05 +01:00
parent f98208c1a1
commit c43704a0a7
4 changed files with 21 additions and 11 deletions

View File

@ -171,7 +171,7 @@ ot::status ot::parse_options(int argc, char** argv, ot::options& opt, FILE* comm
*/
void ot::print_comments(const std::list<std::string>& comments, FILE* output)
{
static ot::encoding_converter from_utf8("UTF-8", "//TRANSLIT");
static ot::encoding_converter from_utf8("UTF-8", "//IGNORE");
std::string local;
bool info_lost = false;
bool bad_comments = false;
@ -195,7 +195,7 @@ void ot::print_comments(const std::list<std::string>& comments, FILE* output)
putc('\n', output);
}
if (info_lost)
fputs("warning: Some tags have been transliterated to your system encoding.\n", stderr);
fputs("warning: Some characters are not supported by your system encoding and have been discarded.\n", stderr);
if (bad_comments)
fputs("warning: Some tags are not properly encoded and have not been displayed.\n", stderr);
if (has_newline)

View File

@ -118,13 +118,24 @@ ot::status ot::encoding_converter::operator()(const char* in, size_t n, std::str
char *out_cursor = chunk;
size_t out_left = chunk_size;
size_t rc = iconv(cd, &in_cursor, &in_left, &out_cursor, &out_left);
if (rc == (size_t) -1 && errno != E2BIG)
out.append(chunk, out_cursor - chunk);
// With //IGNORE, iconv yields EILSEQ on bad conversion but still returns reasonable
// data. Note that EILSEQ is returned at the very end so it's basically like a fatal
// error on the last chunk. When the output buffer is too small, it yields E2BIG and
// we need to loop. Any other error is fatal. A return code other than 0 or -1
// indicates a lossy transliteration.
if (rc == (size_t) -1 && errno == EILSEQ) {
lost_information = true;
break;
} else if (rc == (size_t) -1 && errno != E2BIG) {
return {ot::st::badly_encoded,
"Could not convert string '" + std::string(in, n) + "': " +
strerror(errno)};
if (rc != 0)
} else if (rc != 0 && rc != (size_t) -1) {
lost_information = true;
out.append(chunk, out_cursor - chunk);
}
if (in_cursor == nullptr)
break;
else if (in_left == 0)
@ -132,8 +143,7 @@ ot::status ot::encoding_converter::operator()(const char* in, size_t n, std::str
}
if (lost_information)
return {ot::st::information_lost,
"Some characters could not be converted into the target encoding "
"in string '" + std::string(in, n) + "'."};
"Some characters could not be converted into the target encoding."};
return ot::st::ok;
}

View File

@ -275,11 +275,11 @@ is_deeply(opustags('-i', 'out.opus', "--add=I=\xf9\xce", {mode => ':raw'}), ['',
is_deeply(opustags('out.opus', {mode => ':raw'}), [<<"END_OUT", <<'END_ERR', 0], 'read tags in ISO-8859-1');
encoder=Lavc58.18.100 libopus
TITLE=???
TITLE=
ARTIST=\xe9\xe0\xe7
I=\xf9\xce
END_OUT
warning: Some tags have been transliterated to your system encoding.
warning: Some characters are not supported by your system encoding and have been discarded.
END_ERR
$ENV{LC_ALL} = '';

View File

@ -34,7 +34,7 @@ void check_converter()
{
const char* ephemere_iso = "\xc9\x70\x68\xe9\x6d\xe8\x72\x65";
ot::encoding_converter to_utf8("ISO_8859-1", "UTF-8");
ot::encoding_converter from_utf8("UTF-8", "ISO_8859-1//TRANSLIT");
ot::encoding_converter from_utf8("UTF-8", "ISO_8859-1//IGNORE");
std::string out;
ot::status rc = to_utf8(ephemere_iso, out);
@ -46,7 +46,7 @@ void check_converter()
is(out, ephemere_iso, "conversion from UTF-8 is correct");
rc = from_utf8("\xFF\xFF", out);
is(rc, ot::st::badly_encoded, "conversion from bad UTF-8 fails");
is(rc, ot::st::information_lost, "conversion from bad UTF-8 is lossy");
}
void check_shell_esape()