From 0067162ffb1db4137fffbca01c0b3ed6a310891d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Mangano?= <fmang@mg0.fr>
Date: Tue, 30 Apr 2024 12:17:00 +0900
Subject: [PATCH] Support NUL delimiters with -z

---
 README.md      |  1 +
 opustags.1     | 22 +++++++++++++++++-----
 src/cli.cc     | 50 +++++++++++++++++++++++++++-----------------------
 src/opustags.h | 11 +++++++++--
 t/cli.cc       |  4 +++-
 t/opustags.t   | 17 ++++++++++++++++-
 6 files changed, 73 insertions(+), 32 deletions(-)

diff --git a/README.md b/README.md
index 5ad1e18..39e0e4a 100644
--- a/README.md
+++ b/README.md
@@ -69,5 +69,6 @@ Documentation
       --vendor                      print the vendor string
       --set-vendor VALUE            set the vendor string
       --raw                         disable encoding conversion
+      -z                            delimit tags with NUL
 
 See the man page, `opustags.1`, for extensive documentation.
diff --git a/opustags.1 b/opustags.1
index 27fa9f0..0ee39f9 100644
--- a/opustags.1
+++ b/opustags.1
@@ -1,4 +1,4 @@
-.TH opustags 1 "March 2023" "@PROJECT_NAME@ @PROJECT_VERSION@"
+.TH opustags 1 "April 2024" "@PROJECT_NAME@ @PROJECT_VERSION@"
 .SH NAME
 opustags \- Ogg Opus tag editor
 .SH SYNOPSIS
@@ -135,6 +135,14 @@ corrupted or possibly even contain intentional binary data. In that case, --raw
 kind of binary data without ensuring the validity of the tags encoding. This option may also be
 useful when your system encoding is different from UTF-8 and you wish to preserve the full UTF-8
 character set even though your system cannot display it.
+.TP
+.B \-z
+When editing tags programmatically with line-based tools like grep or sed, tags containing newlines
+are likely to corrupt the result because these tools won’t interpret multi-line tags as a whole. To
+make automatic processing easier, \fB-z\fP delimits tags by a null byte (ASCII NUL) instead of line
+feeds. The same \fB-z\fP flag is also supported by GNU grep and GNU sed and, combined with opustags
+-z, makes them process the input tag-by-tag instead of line-by-line, thus supporting multi-line
+tags as well. This option also disables the TAB prefix for continuation lines after a line feed.
 .SH EXAMPLES
 .PP
 List all the tags in file foo.opus:
@@ -145,10 +153,6 @@ Copy in.opus to out.opus, with the TITLE tag added:
 .PP
 	opustags in.opus --output out.opus --add "TITLE=Hello world!"
 .PP
-Replace all the tags in dest.opus with the ones from src.opus:
-.PP
-	opustags src.opus | opustags --in-place dest.opus --set-all
-.PP
 Remove the previously existing ARTIST tags and add the two X and Y ARTIST tags, then display the new
 tags without writing them to the Opus file:
 .PP
@@ -157,6 +161,14 @@ tags without writing them to the Opus file:
 Edit tags interactively in Vim:
 .PP
 	EDITOR=vim opustags --in-place --edit file.opus
+.PP
+Replace all the tags in dest.opus with the ones from src.opus:
+.PP
+	opustags src.opus | opustags --in-place dest.opus --set-all
+.PP
+Use GNU grep to remove all the CHAPTER* tags, with -z to support multi-line tags:
+.PP
+	opustags -z file.opus | grep -z -v ^CHAPTER | opustags -z --in-place file.opus --set-all
 .SH CAVEATS
 .PP
 \fBopustags\fP currently has the following limitations:
diff --git a/src/cli.cc b/src/cli.cc
index 7ba41f8..40bc10b 100644
--- a/src/cli.cc
+++ b/src/cli.cc
@@ -41,6 +41,7 @@ Options:
   --vendor                      print the vendor string
   --set-vendor VALUE            set the vendor string
   --raw                         disable encoding conversion
+  -z                            delimit tags with NUL
 
 See the man page for extensive documentation.
 )raw";
@@ -79,7 +80,7 @@ ot::options ot::parse_options(int argc, char** argv, FILE* comments_input)
 		throw status {st::bad_arguments, "No arguments specified. Use -h for help."};
 	int c;
 	optind = 0;
-	while ((c = getopt_long(argc, argv, ":ho:iyd:a:s:DSe", getopt_options, NULL)) != -1) {
+	while ((c = getopt_long(argc, argv, ":ho:iyd:a:s:DSez", getopt_options, NULL)) != -1) {
 		switch (c) {
 		case 'h':
 			opt.print_help = true;
@@ -139,6 +140,9 @@ ot::options ot::parse_options(int argc, char** argv, FILE* comments_input)
 		case 'r':
 			opt.raw = true;
 			break;
+		case 'z':
+			opt.tag_delimiter = '\0';
+			break;
 		case ':':
 			throw status {st::bad_arguments, "Missing value for option '"s + argv[optind - 1] + "'."};
 		default:
@@ -226,17 +230,17 @@ ot::options ot::parse_options(int argc, char** argv, FILE* comments_input)
 
 	if (set_all) {
 		// Read comments from stdin and prepend them to opt.to_add.
-		std::list<std::u8string> comments = read_comments(comments_input, opt.raw);
+		std::list<std::u8string> comments = read_comments(comments_input, opt);
 		opt.to_add.splice(opt.to_add.begin(), std::move(comments));
 	}
 	return opt;
 }
 
 /** Format a UTF-8 string by adding tabulations (\t) after line feeds (\n) to mark continuation for
- *  multiline values. */
-static std::u8string format_value(const std::u8string& source)
+ *  multiline values. With -z, this behavior applies to embedded NUL characters instead of LF. */
+static std::u8string format_value(const std::u8string& source, const ot::options& opt)
 {
-	auto newline_count = std::count(source.begin(), source.end(), u8'\n');
+	auto newline_count = std::count(source.begin(), source.end(), opt.tag_delimiter);
 
 	// General case: the value fits on a single line. Use std::string’s copy constructor for the
 	// most efficient copy we could hope for.
@@ -247,7 +251,7 @@ static std::u8string format_value(const std::u8string& source)
 	formatted.reserve(source.size() + newline_count);
 	for (auto c : source) {
 		formatted.push_back(c);
-		if (c == '\n')
+		if (c == opt.tag_delimiter)
 			formatted.push_back(u8'\t');
 	}
 	return formatted;
@@ -257,9 +261,9 @@ static std::u8string format_value(const std::u8string& source)
  * Convert the comment from UTF-8 to the system encoding if relevant, and print it with a trailing
  * line feed.
  */
-static void puts_utf8(std::u8string_view str, FILE* output, bool raw)
+static void puts_utf8(std::u8string_view str, FILE* output, const ot::options& opt)
 {
-	if (raw) {
+	if (opt.raw) {
 		fwrite(str.data(), 1, str.size(), output);
 	} else {
 		try {
@@ -270,7 +274,7 @@ static void puts_utf8(std::u8string_view str, FILE* output, bool raw)
 			throw;
 		}
 	}
-	putc('\n', output);
+	putc(opt.tag_delimiter, output);
 }
 
 /**
@@ -279,7 +283,7 @@ static void puts_utf8(std::u8string_view str, FILE* output, bool raw)
  * To disambiguate between a newline embedded in a comment and a newline representing the start of
  * the next tag, continuation lines always have a single TAB (^I) character added to the beginning.
  */
-void ot::print_comments(const std::list<std::u8string>& comments, FILE* output, bool raw)
+void ot::print_comments(const std::list<std::u8string>& comments, FILE* output, const ot::options& opt)
 {
 	bool has_control = false;
 	for (const std::u8string& source_comment : comments) {
@@ -291,14 +295,14 @@ void ot::print_comments(const std::list<std::u8string>& comments, FILE* output,
 				}
 			}
 		}
-		std::u8string utf8_comment = format_value(source_comment);
-		puts_utf8(utf8_comment, output, raw);
+		std::u8string utf8_comment = format_value(source_comment, opt);
+		puts_utf8(utf8_comment, output, opt);
 	}
 	if (has_control)
 		fputs("warning: Some tags contain control characters.\n", stderr);
 }
 
-std::list<std::u8string> ot::read_comments(FILE* input, bool raw)
+std::list<std::u8string> ot::read_comments(FILE* input, const ot::options& opt)
 {
 	std::list<std::u8string> comments;
 	comments.clear();
@@ -306,12 +310,12 @@ std::list<std::u8string> ot::read_comments(FILE* input, bool raw)
 	size_t buflen = 0;
 	ssize_t nread;
 	std::u8string* previous_comment = nullptr;
-	while ((nread = getline(&source_line, &buflen, input)) != -1) {
-		if (nread > 0 && source_line[nread - 1] == '\n')
+	while ((nread = getdelim(&source_line, &buflen, opt.tag_delimiter, input)) != -1) {
+		if (nread > 0 && source_line[nread - 1] == opt.tag_delimiter)
 			--nread; // Chomp.
 
 		std::u8string line;
-		if (raw) {
+		if (opt.raw) {
 			line = std::u8string(reinterpret_cast<char8_t*>(source_line), nread);
 		} else {
 			try {
@@ -335,7 +339,7 @@ std::list<std::u8string> ot::read_comments(FILE* input, bool raw)
 				free(source_line);
 				throw rc;
 			} else {
-				line[0] = '\n';
+				line[0] = opt.tag_delimiter;
 				previous_comment->append(line);
 			}
 		} else if (line.find(u8'=') == decltype(line)::npos) {
@@ -391,7 +395,7 @@ static void edit_tags(ot::opus_tags& tags, const ot::options& opt)
 }
 
 /** Spawn VISUAL or EDITOR to edit the given tags. */
-static void edit_tags_interactively(ot::opus_tags& tags, const std::optional<std::string>& base_path, bool raw)
+static void edit_tags_interactively(ot::opus_tags& tags, const std::optional<std::string>& base_path, const ot::options& opt)
 {
 	const char* editor = nullptr;
 	if (getenv("TERM") != nullptr)
@@ -410,7 +414,7 @@ static void edit_tags_interactively(ot::opus_tags& tags, const std::optional<std
 	if (fd == -1 || (tags_file = fdopen(fd, "w")) == nullptr)
 		throw ot::status {ot::st::standard_error,
 		                  "Could not open '" + tags_path + "': " + strerror(errno)};
-	ot::print_comments(tags.comments, tags_file.get(), raw);
+	ot::print_comments(tags.comments, tags_file.get(), opt);
 	tags_file.reset();
 
 	// Spawn the editor, and watch the modification timestamps.
@@ -441,7 +445,7 @@ static void edit_tags_interactively(ot::opus_tags& tags, const std::optional<std
 	if (tags_file == nullptr)
 		throw ot::status {ot::st::standard_error, "Error opening " + tags_path + ": " + strerror(errno)};
 	try {
-		tags.comments = ot::read_comments(tags_file.get(), raw);
+		tags.comments = ot::read_comments(tags_file.get(), opt);
 	} catch (const ot::status& rc) {
 		fprintf(stderr, "warning: Leaving %s on the disk.\n", tags_path.c_str());
 		throw;
@@ -524,7 +528,7 @@ static void process(ot::ogg_reader& reader, ot::ogg_writer* writer, const ot::op
 			if (writer) {
 				if (opt.edit_interactively) {
 					fflush(writer->file); // flush before calling the subprocess
-					edit_tags_interactively(tags, writer->path, opt.raw);
+					edit_tags_interactively(tags, writer->path, opt);
 				}
 				auto packet = ot::render_tags(tags);
 				writer->write_header_packet(serialno, pageno, packet);
@@ -532,9 +536,9 @@ static void process(ot::ogg_reader& reader, ot::ogg_writer* writer, const ot::op
 			} else {
 				if (opt.cover_out != "-") {
 					if (opt.print_vendor)
-						puts_utf8(tags.vendor, stdout, opt.raw);
+						puts_utf8(tags.vendor, stdout, opt);
 					else
-						ot::print_comments(tags.comments, stdout, opt.raw);
+						ot::print_comments(tags.comments, stdout, opt);
 				}
 				break;
 			}
diff --git a/src/opustags.h b/src/opustags.h
index e83ad9f..93c323b 100644
--- a/src/opustags.h
+++ b/src/opustags.h
@@ -534,6 +534,13 @@ struct options {
 	 * extract and set as-is, encoding conversion would get in the way.
 	 */
 	bool raw = false;
+	/**
+	 * In text mode (default), tags are separated by a line feed. However, when combining
+	 * opustags with grep or other line-based tools, this proves to be a bad separator because
+	 * tag values may contain newlines. Changing the delimiter to '\0' with -z eases the
+	 * processing of multi-line tags with other tools that support null-terminated lines.
+	 */
+	char tag_delimiter = '\n';
 };
 
 /**
@@ -551,13 +558,13 @@ options parse_options(int argc, char** argv, FILE* comments);
  *
  * The output generated is meant to be parseable by #ot::read_comments.
  */
-void print_comments(const std::list<std::u8string>& comments, FILE* output, bool raw);
+void print_comments(const std::list<std::u8string>& comments, FILE* output, const options& opt);
 
 /**
  * Parse the comments outputted by #ot::print_comments. Unless raw is true, the comments are
  * converted from the system encoding to UTF-8, and returned as UTF-8.
  */
-std::list<std::u8string> read_comments(FILE* input, bool raw);
+std::list<std::u8string> read_comments(FILE* input, const options& opt);
 
 /**
  * Remove all comments matching the specified selector, which may either be a field name or a
diff --git a/t/cli.cc b/t/cli.cc
index 0f556e7..3af98cb 100644
--- a/t/cli.cc
+++ b/t/cli.cc
@@ -5,8 +5,10 @@
 
 static ot::status read_comments(FILE* input, std::list<std::u8string>& comments, bool raw)
 {
+	ot::options opt;
+	opt.raw = raw;
 	try {
-		comments = ot::read_comments(input, raw);
+		comments = ot::read_comments(input, opt);
 	} catch (const ot::status& rc) {
 		return rc;
 	}
diff --git a/t/opustags.t b/t/opustags.t
index 039f8f6..2718179 100755
--- a/t/opustags.t
+++ b/t/opustags.t
@@ -4,7 +4,7 @@ use strict;
 use warnings;
 use utf8;
 
-use Test::More tests => 62;
+use Test::More tests => 66;
 use Test::Deep qw(cmp_deeply re);
 
 use Digest::MD5;
@@ -327,3 +327,18 @@ is_deeply(opustags(qw(--vendor gobble.opus)), ["Lavf58.12.100\n", '', 0], 'print
 is_deeply(opustags(qw(--set-vendor opustags gobble.opus -o out.opus)), ['', '', 0], 'set the vendor string');
 is_deeply(opustags(qw(--vendor out.opus)), ["opustags\n", '', 0], 'the vendor string was updated');
 unlink('out.opus');
+
+####################################################################################################
+# Multi-line tags
+
+is_deeply(opustags(qw(--set-all gobble.opus -o out.opus), { in => "MULTILINE=one\n\ttwo\nSIMPLE=three\n" }), ['', '', 0], 'parses continuation lines');
+is_deeply(opustags(qw(out.opus -z)), ["MULTILINE=one\ntwo\0SIMPLE=three\0", '', 0], 'delimits output with NUL on -z');
+unlink('out.opus');
+
+is_deeply(opustags(qw(--set-all gobble.opus -o out.opus -z), { in => "MULTILINE=one\ntwo\0SIMPLE=three\0" }), ['', '', 0], 'delimits input with NUL on -z');
+is_deeply(opustags(qw(out.opus)), [<<'END', '', 0], 'indents continuation lines');
+MULTILINE=one
+	two
+SIMPLE=three
+END
+unlink('out.opus');