From 0067162ffb1db4137fffbca01c0b3ed6a310891d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Mangano?= Date: Tue, 30 Apr 2024 12:17:00 +0900 Subject: [PATCH] Support NUL delimiters with -z --- README.md | 1 + opustags.1 | 22 +++++++++++++++++----- src/cli.cc | 50 +++++++++++++++++++++++++++----------------------- src/opustags.h | 11 +++++++++-- t/cli.cc | 4 +++- t/opustags.t | 17 ++++++++++++++++- 6 files changed, 73 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index 5ad1e18..39e0e4a 100644 --- a/README.md +++ b/README.md @@ -69,5 +69,6 @@ Documentation --vendor print the vendor string --set-vendor VALUE set the vendor string --raw disable encoding conversion + -z delimit tags with NUL See the man page, `opustags.1`, for extensive documentation. diff --git a/opustags.1 b/opustags.1 index 27fa9f0..0ee39f9 100644 --- a/opustags.1 +++ b/opustags.1 @@ -1,4 +1,4 @@ -.TH opustags 1 "March 2023" "@PROJECT_NAME@ @PROJECT_VERSION@" +.TH opustags 1 "April 2024" "@PROJECT_NAME@ @PROJECT_VERSION@" .SH NAME opustags \- Ogg Opus tag editor .SH SYNOPSIS @@ -135,6 +135,14 @@ corrupted or possibly even contain intentional binary data. In that case, --raw kind of binary data without ensuring the validity of the tags encoding. This option may also be useful when your system encoding is different from UTF-8 and you wish to preserve the full UTF-8 character set even though your system cannot display it. +.TP +.B \-z +When editing tags programmatically with line-based tools like grep or sed, tags containing newlines +are likely to corrupt the result because these tools won’t interpret multi-line tags as a whole. To +make automatic processing easier, \fB-z\fP delimits tags by a null byte (ASCII NUL) instead of line +feeds. That same \fB-z\fP flag is also supported by GNU grep or GNU sed and, combined with opustags +-z, would make them process the input tag-by-tag instead of line-by-line, thus supporting multi-line +tags as well. 
This option also disables the TAB prefix for continuation lines after a line feed. .SH EXAMPLES .PP List all the tags in file foo.opus: @@ -145,10 +153,6 @@ Copy in.opus to out.opus, with the TITLE tag added: .PP opustags in.opus --output out.opus --add "TITLE=Hello world!" .PP -Replace all the tags in dest.opus with the ones from src.opus: -.PP - opustags src.opus | opustags --in-place dest.opus --set-all -.PP Remove the previously existing ARTIST tags and add the two X and Y ARTIST tags, then display the new tags without writing them to the Opus file: .PP @@ -157,6 +161,14 @@ tags without writing them to the Opus file: Edit tags interactively in Vim: .PP EDITOR=vim opustags --in-place --edit file.opus +.PP +Replace all the tags in dest.opus with the ones from src.opus: +.PP + opustags src.opus | opustags --in-place dest.opus --set-all +.PP +Use GNU grep to remove all the CHAPTER* tags, with -z to support multi-line tags: +.PP + opustags -z file.opus | grep -z -v ^CHAPTER | opustags -z --in-place file.opus --set-all .SH CAVEATS .PP \fBopustags\fP currently has the following limitations: diff --git a/src/cli.cc b/src/cli.cc index 7ba41f8..40bc10b 100644 --- a/src/cli.cc +++ b/src/cli.cc @@ -41,6 +41,7 @@ Options: --vendor print the vendor string --set-vendor VALUE set the vendor string --raw disable encoding conversion + -z delimit tags with NUL See the man page for extensive documentation. )raw"; @@ -79,7 +80,7 @@ ot::options ot::parse_options(int argc, char** argv, FILE* comments_input) throw status {st::bad_arguments, "No arguments specified. 
Use -h for help."}; int c; optind = 0; - while ((c = getopt_long(argc, argv, ":ho:iyd:a:s:DSe", getopt_options, NULL)) != -1) { + while ((c = getopt_long(argc, argv, ":ho:iyd:a:s:DSez", getopt_options, NULL)) != -1) { switch (c) { case 'h': opt.print_help = true; @@ -139,6 +140,9 @@ ot::options ot::parse_options(int argc, char** argv, FILE* comments_input) case 'r': opt.raw = true; break; + case 'z': + opt.tag_delimiter = '\0'; + break; case ':': throw status {st::bad_arguments, "Missing value for option '"s + argv[optind - 1] + "'."}; default: @@ -226,17 +230,17 @@ ot::options ot::parse_options(int argc, char** argv, FILE* comments_input) if (set_all) { // Read comments from stdin and prepend them to opt.to_add. - std::list<std::u8string> comments = read_comments(comments_input, opt.raw); + std::list<std::u8string> comments = read_comments(comments_input, opt); opt.to_add.splice(opt.to_add.begin(), std::move(comments)); } return opt; } /** Format a UTF-8 string by adding tabulations (\t) after line feeds (\n) to mark continuation for - * multiline values. */ -static std::u8string format_value(const std::u8string& source) + * multiline values. With -z, this behavior applies for embedded NUL characters instead of LF. */ +static std::u8string format_value(const std::u8string& source, const ot::options& opt) { - auto newline_count = std::count(source.begin(), source.end(), u8'\n'); + auto newline_count = std::count(source.begin(), source.end(), opt.tag_delimiter); // General case: the value fits on a single line. Use std::string’s copy constructor for the // most efficient copy we could hope for. 
@@ -247,7 +251,7 @@ static std::u8string format_value(const std::u8string& source) formatted.reserve(source.size() + newline_count); for (auto c : source) { formatted.push_back(c); - if (c == '\n') + if (c == opt.tag_delimiter) formatted.push_back(u8'\t'); } return formatted; @@ -257,9 +261,9 @@ static std::u8string format_value(const std::u8string& source) * Convert the comment from UTF-8 to the system encoding if relevant, and print it with a trailing * line feed. */ -static void puts_utf8(std::u8string_view str, FILE* output, bool raw) +static void puts_utf8(std::u8string_view str, FILE* output, const ot::options& opt) { - if (raw) { + if (opt.raw) { fwrite(str.data(), 1, str.size(), output); } else { try { @@ -270,7 +274,7 @@ static void puts_utf8(std::u8string_view str, FILE* output, bool raw) throw; } } - putc('\n', output); + putc(opt.tag_delimiter, output); } /** @@ -279,7 +283,7 @@ static void puts_utf8(std::u8string_view str, FILE* output, bool raw) * To disambiguate between a newline embedded in a comment and a newline representing the start of * the next tag, continuation lines always have a single TAB (^I) character added to the beginning. 
*/ -void ot::print_comments(const std::list<std::u8string>& comments, FILE* output, bool raw) +void ot::print_comments(const std::list<std::u8string>& comments, FILE* output, const ot::options& opt) { bool has_control = false; for (const std::u8string& source_comment : comments) { @@ -291,14 +295,14 @@ void ot::print_comments(const std::list<std::u8string>& comments, FILE* output, } } } - std::u8string utf8_comment = format_value(source_comment); - puts_utf8(utf8_comment, output, raw); + std::u8string utf8_comment = format_value(source_comment, opt); + puts_utf8(utf8_comment, output, opt); } if (has_control) fputs("warning: Some tags contain control characters.\n", stderr); } -std::list<std::u8string> ot::read_comments(FILE* input, bool raw) +std::list<std::u8string> ot::read_comments(FILE* input, const ot::options& opt) { std::list<std::u8string> comments; comments.clear(); @@ -306,12 +310,12 @@ std::list<std::u8string> ot::read_comments(FILE* input, bool raw) size_t buflen = 0; ssize_t nread; std::u8string* previous_comment = nullptr; - while ((nread = getline(&source_line, &buflen, input)) != -1) { - if (nread > 0 && source_line[nread - 1] == '\n') + while ((nread = getdelim(&source_line, &buflen, opt.tag_delimiter, input)) != -1) { + if (nread > 0 && source_line[nread - 1] == opt.tag_delimiter) --nread; // Chomp. std::u8string line; - if (raw) { + if (opt.raw) { line = std::u8string(reinterpret_cast<char8_t*>(source_line), nread); } else { try { @@ -335,7 +339,7 @@ std::list<std::u8string> ot::read_comments(FILE* input, bool raw) free(source_line); throw rc; } else { - line[0] = '\n'; + line[0] = opt.tag_delimiter; previous_comment->append(line); } } else if (line.find(u8'=') == decltype(line)::npos) { @@ -391,7 +395,7 @@ static void edit_tags(ot::opus_tags& tags, const ot::options& opt) } /** Spawn VISUAL or EDITOR to edit the given tags. 
*/ -static void edit_tags_interactively(ot::opus_tags& tags, const std::optional& base_path, bool raw) +static void edit_tags_interactively(ot::opus_tags& tags, const std::optional& base_path, const ot::options& opt) { const char* editor = nullptr; if (getenv("TERM") != nullptr) @@ -410,7 +414,7 @@ static void edit_tags_interactively(ot::opus_tags& tags, const std::optionalfile); // flush before calling the subprocess - edit_tags_interactively(tags, writer->path, opt.raw); + edit_tags_interactively(tags, writer->path, opt); } auto packet = ot::render_tags(tags); writer->write_header_packet(serialno, pageno, packet); @@ -532,9 +536,9 @@ static void process(ot::ogg_reader& reader, ot::ogg_writer* writer, const ot::op } else { if (opt.cover_out != "-") { if (opt.print_vendor) - puts_utf8(tags.vendor, stdout, opt.raw); + puts_utf8(tags.vendor, stdout, opt); else - ot::print_comments(tags.comments, stdout, opt.raw); + ot::print_comments(tags.comments, stdout, opt); } break; } diff --git a/src/opustags.h b/src/opustags.h index e83ad9f..93c323b 100644 --- a/src/opustags.h +++ b/src/opustags.h @@ -534,6 +534,13 @@ struct options { * extract and set as-is, encoding conversion would get in the way. */ bool raw = false; + /** + * In text mode (default), tags are separated by a line feed. However, when combining + * opustags with grep or other line-based tools, this proves to be a bad separator because + * tag values may contain newlines. Changing the delimiter to '\0' with -z eases the + * processing of multi-line tags with other tools that support null-terminated lines. + */ + char tag_delimiter = '\n'; }; /** @@ -551,13 +558,13 @@ options parse_options(int argc, char** argv, FILE* comments); * * The output generated is meant to be parseable by #ot::read_comments. 
*/ -void print_comments(const std::list<std::u8string>& comments, FILE* output, bool raw); +void print_comments(const std::list<std::u8string>& comments, FILE* output, const options& opt); /** * Parse the comments outputted by #ot::print_comments. Unless raw is true, the comments are * converted from the system encoding to UTF-8, and returned as UTF-8. */ -std::list<std::u8string> read_comments(FILE* input, bool raw); +std::list<std::u8string> read_comments(FILE* input, const options& opt); /** * Remove all comments matching the specified selector, which may either be a field name or a diff --git a/t/cli.cc b/t/cli.cc index 0f556e7..3af98cb 100644 --- a/t/cli.cc +++ b/t/cli.cc @@ -5,8 +5,10 @@ static ot::status read_comments(FILE* input, std::list<std::u8string>& comments, bool raw) { + ot::options opt; + opt.raw = raw; try { - comments = ot::read_comments(input, raw); + comments = ot::read_comments(input, opt); } catch (const ot::status& rc) { return rc; } diff --git a/t/opustags.t b/t/opustags.t index 039f8f6..2718179 100755 --- a/t/opustags.t +++ b/t/opustags.t @@ -4,7 +4,7 @@ use strict; use warnings; use utf8; -use Test::More tests => 62; +use Test::More tests => 66; use Test::Deep qw(cmp_deeply re); use Digest::MD5; @@ -327,3 +327,18 @@ is_deeply(opustags(qw(--vendor gobble.opus)), ["Lavf58.12.100\n", '', 0], 'print is_deeply(opustags(qw(--set-vendor opustags gobble.opus -o out.opus)), ['', '', 0], 'set the vendor string'); is_deeply(opustags(qw(--vendor out.opus)), ["opustags\n", '', 0], 'the vendor string was updated'); unlink('out.opus'); + +#################################################################################################### +# Multi-line tags + +is_deeply(opustags(qw(--set-all gobble.opus -o out.opus), { in => "MULTILINE=one\n\ttwo\nSIMPLE=three\n" }), ['', '', 0], 'parses continuation lines'); +is_deeply(opustags(qw(out.opus -z)), ["MULTILINE=one\ntwo\0SIMPLE=three\0", '', 0], 'delimits output with NUL on -z'); +unlink('out.opus'); + +is_deeply(opustags(qw(--set-all gobble.opus -o out.opus -z), { in => 
"MULTILINE=one\ntwo\0SIMPLE=three\0" }), ['', '', 0], 'delimits input with NUL on -z'); +is_deeply(opustags(qw(out.opus)), [<<'END', '', 0], 'indents continuation lines'); +MULTILINE=one + two +SIMPLE=three +END +unlink('out.opus');