Support multiline tags in non-UTF-8 environments

The \n to \n\t substitution was applied after the conversion to the local encoding, which assumed an ASCII-compatible environment and would not have worked under UTF-16. It’s therefore safer to perform the operation on the UTF-8 text, before the encoding conversion.
2024-09-19 23:32:36 +02:00 · 2023-02-07 15:24:53 +09:00 · 2023-02-07 15:24:53 +09:00 · 6d6722fb24
commit 6d6722fb24
parent d95fd45aef
2 changed files with 48 additions and 57 deletions
--- a/src/cli.cc
+++ b/src/cli.cc
@ -15,8 +15,6 @@
 #include <string.h>
 #include <sys/stat.h>
 #include <unistd.h>
-#include <assert.h>
-#include <cctype>		// for std::iscntrl

 using namespace std::literals::string_literals;

@ -172,77 +170,66 @@ ot::options ot::parse_options(int argc, char** argv, FILE* comments_input)
 	return opt;
 }

+/** Format a UTF-8 string by inserting a tab (\t) after each line feed (\n) to mark continuation
+ *  lines in multiline values. */
+static std::string format_value(const std::string& source)
+{
+	auto newline_count = std::count(source.begin(), source.end(), '\n');
+
+	// General case: the value fits on a single line. Use std::string’s copy constructor for the
+	// most efficient copy we could hope for.
+	if (newline_count == 0)
+		return source;
+
+	std::string formatted;
+	formatted.reserve(source.size() + newline_count);
+	for (auto c : source) {
+		formatted.push_back(c);
+		if (c == '\n')
+			formatted.push_back('\t');
+	}
+	return formatted;
+}
+
 /**
- * Print comments in a human readable format that can also be read
- * back in by #read_comment.
+ * Print comments in a human readable format that can also be read back in by #read_comment.
 *
- * To disambiguate between a newline embedded in a comment and a
- * newline representing the start of the next tag, continuation lines
- * always have a single TAB (^I) character added to the beginning.
- * 
+ * To disambiguate between a newline embedded in a comment and a newline representing the start of
+ * the next tag, continuation lines always have a single TAB (^I) character added to the beginning.
 */
 void ot::print_comments(const std::list<std::string>& comments, FILE* output, bool raw)
 {
 	static ot::encoding_converter from_utf8("UTF-8", "");
 	std::string local;
-	bool has_newline = false;
 	bool has_control = false;
-	for (const std::string& utf8_comment : comments) {
-		const std::string* commentp;
+	for (const std::string& source_comment : comments) {
+		if (!has_control) { // Don’t bother analyzing comments if the flag is already up.
+			for (unsigned char c : source_comment) {
+				if (c < 0x20 && c != '\n') {
+					has_control = true;
+					break;
+				}
+			}
+		}
+
+		std::string utf8_comment = format_value(source_comment);
+		const std::string* comment;
 		// Convert the comment from UTF-8 to the system encoding if relevant.
 		if (raw) {
-			commentp = &utf8_comment;
+			comment = &utf8_comment;
 		} else {
 			try {
 				local = from_utf8(utf8_comment);
-				commentp = &local;
+				comment = &local;
 			} catch (ot::status& rc) {
 				rc.message += " See --raw.";
 				throw;
 			}
 		}

-		// Check for embedded newlines so we can insert TAB afterward
-		std::string comment = *commentp;
-		unsigned int newline_count = 0;
-		for (int t = 0; t < comment.length(); t++) {
-			if (comment[t]  == '\n') {
-				newline_count++;
-			}
-			else if (std::iscntrl(comment[t])) {
-				has_control = true;
-			}
-		}
-
-		
-		// Copy byte by byte into a new string with TAB added after each newline
-		std::string tabbed_comment;
-		if (newline_count) {
-			tabbed_comment.resize(comment.length() + newline_count);
-			tabbed_comment.reserve( tabbed_comment.size() );
-		  int tabs_done = 0;
-		  for (int t = 0; t < comment.length(); t++) {
-			  tabbed_comment[t + tabs_done] = comment[t];
-			  if (comment[t] == '\n') {
-				  tabs_done++;
-				  tabbed_comment[t+tabs_done] = '\t';
-			  }
-		  }
-
-		  // Assertion: Inserted as many tabs as newlines were found.
-		  assert(newline_count == tabs_done);
-		  // Assertion: Length of new string is exactly as allocated.
-		  assert(tabbed_comment.length() == comment.length() + newline_count);
-
-		  fwrite(tabbed_comment.data(), 1, tabbed_comment.size(), output);
-		}
-		else {
-			fwrite(comment.data(), 1, comment.size(), output);
-		}
-
+		fwrite(comment->data(), 1, comment->size(), output);
 		putc('\n', output);
 	}
-
 	if (has_control)
 		fputs("warning: Some tags contain control characters.\n", stderr);
 }
--- a/t/opustags.t
+++ b/t/opustags.t
@ -4,7 +4,7 @@ use strict;
 use warnings;
 use utf8;

-use Test::More tests => 54;
+use Test::More tests => 55;

 use Digest::MD5;
 use File::Basename;
@ -158,14 +158,18 @@ error: Comment does not contain an equal sign: FOO.
 EOF
 is(md5('out.opus'), '66780307a6081523dc9040f3c47b0448', 'the file did not change');

-is_deeply(opustags('out.opus', '-D', '-a', "X=foo\nbar\tquux"), [<<'END_OUT', <<'END_ERR', 0], 'control characters');
-X=foo
-bar	quux
+is_deeply(opustags('out.opus', '-D', '-a', "X=foobar\tquux"), [<<'END_OUT', <<'END_ERR', 0], 'control characters');
+X=foobar	quux
 END_OUT
-warning: Some tags contain unsupported newline characters.
 warning: Some tags contain control characters.
 END_ERR

+is_deeply(opustags('out.opus', '-D', '-a', "X=foo\n\nbar"), [<<'END_OUT', '', 0], 'newline characters');
+X=foo
+	
+	bar
+END_OUT
+
 is_deeply(opustags(qw(-i out.opus -s fatal=yes -s FOO -s BAR)), ['', <<'EOF', 512], 'bad tag with --set');
 error: Comment does not contain an equal sign: FOO.
 EOF