Use //IGNORE instead of //TRANSLIT when transcoding

//TRANSLIT is not well supported, and in most cases there’s not much that transliteration can help with when the target encoding is limiting. Besides, it sounds reasonable to assume most people use UTF-8 nowadays.
2025-07-14 21:15:27 +02:00 · 2020-12-26 11:53:05 +01:00
parent f98208c1a1
commit c43704a0a7
4 changed files with 21 additions and 11 deletions
--- a/src/cli.cc
+++ b/src/cli.cc
@ -171,7 +171,7 @@ ot::status ot::parse_options(int argc, char** argv, ot::options& opt, FILE* comm
 */
 void ot::print_comments(const std::list<std::string>& comments, FILE* output)
 {
-	static ot::encoding_converter from_utf8("UTF-8", "//TRANSLIT");
+	static ot::encoding_converter from_utf8("UTF-8", "//IGNORE");
 	std::string local;
 	bool info_lost = false;
 	bool bad_comments = false;
@ -195,7 +195,7 @@ void ot::print_comments(const std::list<std::string>& comments, FILE* output)
 		putc('\n', output);
 	}
 	if (info_lost)
-		fputs("warning: Some tags have been transliterated to your system encoding.\n", stderr);
+		fputs("warning: Some characters are not supported by your system encoding and have been discarded.\n", stderr);
 	if (bad_comments)
 		fputs("warning: Some tags are not properly encoded and have not been displayed.\n", stderr);
 	if (has_newline)
--- a/src/system.cc
+++ b/src/system.cc
@ -118,13 +118,24 @@ ot::status ot::encoding_converter::operator()(const char* in, size_t n, std::str
 		char *out_cursor = chunk;
 		size_t out_left = chunk_size;
 		size_t rc = iconv(cd, &in_cursor, &in_left, &out_cursor, &out_left);
-		if (rc == (size_t) -1 && errno != E2BIG)
+		out.append(chunk, out_cursor - chunk);
+
+		// With //IGNORE, iconv yields EILSEQ on bad conversion but still returns reasonable
+		// data. Note that EILSEQ is returned at the very end so it’s basically like a fatal
+		// error on the last chunk. When the output buffer is too small, it yields E2BIG and
+		// we need to loop. Any other error is fatal. A return code other than 0 or -1
+		// indicates a lossy transliteration.
+		if (rc == (size_t) -1 && errno == EILSEQ) {
+			lost_information = true;
+			break;
+		} else if (rc == (size_t) -1 && errno != E2BIG) {
 			return {ot::st::badly_encoded,
 			        "Could not convert string '" + std::string(in, n) + "': " +
 			        strerror(errno)};
-		if (rc != 0)
+		} else if (rc != 0 && rc != (size_t) -1) {
 			lost_information = true;
-		out.append(chunk, out_cursor - chunk);
+		}
+
 		if (in_cursor == nullptr)
 			break;
 		else if (in_left == 0)
@ -132,8 +143,7 @@ ot::status ot::encoding_converter::operator()(const char* in, size_t n, std::str
 	}
 	if (lost_information)
 		return {ot::st::information_lost,
-		        "Some characters could not be converted into the target encoding "
-		        "in string '" + std::string(in, n) + "'."};
+		        "Some characters could not be converted into the target encoding."};
 	return ot::st::ok;
 }

--- a/t/opustags.t
+++ b/t/opustags.t
@ -275,11 +275,11 @@ is_deeply(opustags('-i', 'out.opus', "--add=I=\xf9\xce", {mode => ':raw'}), ['',

 is_deeply(opustags('out.opus', {mode => ':raw'}), [<<"END_OUT", <<'END_ERR', 0], 'read tags in ISO-8859-1');
 encoder=Lavc58.18.100 libopus
-TITLE=???
+TITLE=
 ARTIST=\xe9\xe0\xe7
 I=\xf9\xce
 END_OUT
-warning: Some tags have been transliterated to your system encoding.
+warning: Some characters are not supported by your system encoding and have been discarded.
 END_ERR

 $ENV{LC_ALL} = '';
--- a/t/system.cc
+++ b/t/system.cc
@ -34,7 +34,7 @@ void check_converter()
 {
 	const char* ephemere_iso = "\xc9\x70\x68\xe9\x6d\xe8\x72\x65";
 	ot::encoding_converter to_utf8("ISO_8859-1", "UTF-8");
-	ot::encoding_converter from_utf8("UTF-8", "ISO_8859-1//TRANSLIT");
+	ot::encoding_converter from_utf8("UTF-8", "ISO_8859-1//IGNORE");
 	std::string out;

 	ot::status rc = to_utf8(ephemere_iso, out);
@ -46,7 +46,7 @@ void check_converter()
 	is(out, ephemere_iso, "conversion from UTF-8 is correct");

 	rc = from_utf8("\xFF\xFF", out);
-	is(rc, ot::st::badly_encoded, "conversion from bad UTF-8 fails");
+	is(rc, ot::st::information_lost, "conversion from bad UTF-8 is lossy");
 }

 void check_shell_esape()