diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index cbf3e06..493c377 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -60,7 +60,6 @@ More generally, here are a few features that could be added in the future: - Logicial stream listing and selection for multiplexed files. - Escaping control characters with --escape. - Dump binary packets with --binary. -- Skip encoding conversion with --raw. - Edition of the vendor string. - Edition of the arbitrary binary block past the comments. - Support for OpusTags packets spanning multiple pages (> 64 kB). diff --git a/README.md b/README.md index 8325529..1252bac 100644 --- a/README.md +++ b/README.md @@ -62,5 +62,6 @@ Documentation -s, --set FIELD=VALUE replace a comment -S, --set-all import comments from standard input -e, --edit edit tags interactively in VISUAL/EDITOR + --raw disable encoding conversion See the man page, `opustags.1`, for extensive documentation. diff --git a/opustags.1 b/opustags.1 index 72ab0b3..3f2cff7 100644 --- a/opustags.1 +++ b/opustags.1 @@ -103,6 +103,13 @@ Blank lines and lines starting with \fI#\fP are ignored. Edit tags interactively by spawning the program specified by the EDITOR environment variable. The allowed format is the same as \fB--set-all\fP. If TERM and VISUAL are set, VISUAL takes precedence over EDITOR. +.TP +.B \-\-raw +OpusTags metadata should always be encoded in UTF-8, as per RFC 7845. However, some files may be +corrupted or possibly even contain intentional binary data. In that case, --raw lets you edit that +kind of binary data without ensuring the validity of the tags encoding. This option may also be +useful when your system encoding is different from UTF-8 and you wish to preserve the full UTF-8 +character set even though your system cannot display it. .SH EXAMPLES .PP List all the tags in file foo.opus: diff --git a/src/cli.cc b/src/cli.cc index bf080a5..bcbb344 100644 --- a/src/cli.cc +++ b/src/cli.cc @@ -38,6 +38,7 @@ Options: -s, --set FIELD=VALUE replace a comment -S, --set-all import comments from standard input -e, --edit edit tags interactively in VISUAL/EDITOR + --raw disable encoding conversion See the man page for extensive documentation. )raw"; @@ -53,6 +54,7 @@ static struct option getopt_options[] = { {"delete-all", no_argument, 0, 'D'}, {"set-all", no_argument, 0, 'S'}, {"edit", no_argument, 0, 'e'}, + {"raw", no_argument, 0, 'r'}, {NULL, 0, 0, 0} }; @@ -107,6 +109,9 @@ ot::status ot::parse_options(int argc, char** argv, ot::options& opt, FILE* comm case 'e': opt.edit_interactively = true; break; + case 'r': + opt.raw = true; + break; case ':': return {st::bad_arguments, "Missing value for option '"s + argv[optind - 1] + "'."}; @@ -126,12 +131,14 @@ ot::status ot::parse_options(int argc, char** argv, ot::options& opt, FILE* comm } // Convert arguments to UTF-8. - for (std::list* args : { &opt.to_add, &opt.to_delete }) { - for (std::string& arg : *args) { - rc = to_utf8(arg, utf8); - if (rc != ot::st::ok) - return {st::bad_arguments, "Could not encode argument into UTF-8: " + rc.message}; - arg = std::move(utf8); + if (!opt.raw) { + for (std::list* args : { &opt.to_add, &opt.to_delete }) { + for (std::string& arg : *args) { + rc = to_utf8(arg, utf8); + if (rc != ot::st::ok) + return {st::bad_arguments, "Could not encode argument into UTF-8: " + rc.message}; + arg = std::move(utf8); + } } } @@ -159,7 +166,7 @@ ot::status ot::parse_options(int argc, char** argv, ot::options& opt, FILE* comm if (set_all) { // Read comments from stdin and prepend them to opt.to_add. std::list comments; - auto rc = read_comments(comments_input, comments); + auto rc = read_comments(comments_input, comments, opt.raw); if (rc != st::ok) return rc; opt.to_add.splice(opt.to_add.begin(), std::move(comments)); @@ -174,7 +181,7 @@ ot::status ot::parse_options(int argc, char** argv, ot::options& opt, FILE* comm * callers that don’t escape backslashes. Maybe add options to select a mode between simple, * raw, and escaped. */ -void ot::print_comments(const std::list& comments, FILE* output) +void ot::print_comments(const std::list& comments, FILE* output, bool raw) { static ot::encoding_converter from_utf8("UTF-8", "//IGNORE"); std::string local; @@ -182,25 +189,33 @@ void ot::print_comments(const std::list& comments, FILE* output) bool bad_comments = false; bool has_newline = false; bool has_control = false; - for (const std::string& comment : comments) { - ot::status rc = from_utf8(comment, local); - if (rc == ot::st::information_lost) { - info_lost = true; - } else if (rc != ot::st::ok) { - bad_comments = true; - continue; + for (const std::string& utf8_comment : comments) { + const std::string* comment; + // Convert the comment from UTF-8 to the system encoding if relevant. + if (raw) { + comment = &utf8_comment; + } else { + ot::status rc = from_utf8(utf8_comment, local); + comment = &local; + if (rc == ot::st::information_lost) { + info_lost = true; + } else if (rc != ot::st::ok) { + bad_comments = true; + continue; + } } - for (unsigned char c : comment) { + + for (unsigned char c : *comment) { if (c == '\n') has_newline = true; else if (c < 0x20) has_control = true; } - fwrite(local.data(), 1, local.size(), output); + fwrite(comment->data(), 1, comment->size(), output); putc('\n', output); } if (info_lost) - fputs("warning: Some characters are not supported by your system encoding and have been discarded.\n", stderr); + fputs("warning: Some characters could not be converted to your system encoding and have been discarded. See --raw.\n", stderr); if (bad_comments) fputs("warning: Some tags are not properly encoded and have not been displayed.\n", stderr); if (has_newline) @@ -210,7 +225,7 @@ void ot::print_comments(const std::list& comments, FILE* output) fputs("warning: Some tags contain control characters.\n", stderr); } -ot::status ot::read_comments(FILE* input, std::list& comments) +ot::status ot::read_comments(FILE* input, std::list& comments, bool raw) { static ot::encoding_converter to_utf8("", "UTF-8"); comments.clear(); @@ -229,13 +244,17 @@ ot::status ot::read_comments(FILE* input, std::list& comments) free(line); return rc; } - std::string utf8; - ot::status rc = to_utf8(std::string_view(line, nread), utf8); - if (rc == ot::st::ok) { - comments.emplace_back(std::move(utf8)); + if (raw) { + comments.emplace_back(line, nread); } else { - free(line); - return {ot::st::badly_encoded, "UTF-8 conversion error: " + rc.message}; + std::string utf8; + ot::status rc = to_utf8(std::string_view(line, nread), utf8); + if (rc == ot::st::ok) { + comments.emplace_back(std::move(utf8)); + } else { + free(line); + return {ot::st::badly_encoded, "UTF-8 conversion error: " + rc.message}; + } } } free(line); @@ -281,7 +300,7 @@ static ot::status edit_tags(ot::opus_tags& tags, const ot::options& opt) } /** Spawn VISUAL or EDITOR to edit the given tags. */ -static ot::status edit_tags_interactively(ot::opus_tags& tags, const std::optional& base_path) +static ot::status edit_tags_interactively(ot::opus_tags& tags, const std::optional& base_path, bool raw) { const char* editor = nullptr; if (getenv("TERM") != nullptr) @@ -299,7 +318,7 @@ static ot::status edit_tags_interactively(ot::opus_tags& tags, const std::option if (fd == -1 || (tags_file = fdopen(fd, "w")) == nullptr) return {ot::st::standard_error, "Could not open '" + tags_path + "': " + strerror(errno)}; - ot::print_comments(tags.comments, tags_file); + ot::print_comments(tags.comments, tags_file, raw); if (fclose(tags_file) != 0) return {ot::st::standard_error, tags_path + ": fclose error: "s + strerror(errno)}; @@ -328,7 +347,7 @@ static ot::status edit_tags_interactively(ot::opus_tags& tags, const std::option tags_file = fopen(tags_path.c_str(), "re"); if (tags_file == nullptr) return {ot::st::standard_error, "Error opening " + tags_path + ": " + strerror(errno)}; - if ((rc = ot::read_comments(tags_file, tags.comments)) != ot::st::ok) { + if ((rc = ot::read_comments(tags_file, tags.comments, raw)) != ot::st::ok) { fprintf(stderr, "warning: Leaving %s on the disk.\n", tags_path.c_str()); return rc; } @@ -390,7 +409,7 @@ static ot::status process(ot::ogg_reader& reader, ot::ogg_writer* writer, const if (writer) { if (opt.edit_interactively) { fflush(writer->file); // flush before calling the subprocess - if ((rc = edit_tags_interactively(tags, writer->path)) != ot::st::ok) + if ((rc = edit_tags_interactively(tags, writer->path, opt.raw)) != ot::st::ok) return rc; } auto packet = ot::render_tags(tags); @@ -398,7 +417,7 @@ static ot::status process(ot::ogg_reader& reader, ot::ogg_writer* writer, const if (rc != ot::st::ok) return rc; } else { - ot::print_comments(tags.comments, stdout); + ot::print_comments(tags.comments, stdout, opt.raw); break; } } else { diff --git a/src/opustags.h b/src/opustags.h index 995221a..441657e 100644 --- a/src/opustags.h +++ b/src/opustags.h @@ -466,6 +466,12 @@ struct options { * Options: --add, --set, --set-all */ std::list to_add; + /** + * Disable encoding conversions. OpusTags are specified to always be encoded as UTF-8, but + * if for some reason a specific file contains binary tags that someone would like to + * extract and set as-is, encoding conversion would get in the way. + */ + bool raw = false; }; /** @@ -484,14 +490,14 @@ status parse_options(int argc, char** argv, options& opt, FILE* comments); * * The output generated is meant to be parseable by #ot::read_comments. */ -void print_comments(const std::list& comments, FILE* output); +void print_comments(const std::list& comments, FILE* output, bool raw); /** * Parse the comments outputted by #ot::print_comments. * * The comments are converted from the system encoding to UTF-8, and returned as UTF-8. */ -status read_comments(FILE* input, std::list& comments); +status read_comments(FILE* input, std::list& comments, bool raw); /** * Remove all comments matching the specified selector, which may either be a field name or a diff --git a/t/cli.cc b/t/cli.cc index 4133812..19153da 100644 --- a/t/cli.cc +++ b/t/cli.cc @@ -12,7 +12,7 @@ void check_read_comments() { std::string txt = "TITLE=a b c\n\nARTIST=X\nArtist=Y\n"s; ot::file input = fmemopen((char*) txt.data(), txt.size(), "r"); - rc = ot::read_comments(input.get(), comments); + rc = ot::read_comments(input.get(), comments, false); if (rc != ot::st::ok) throw failure("could not read comments"); auto&& expected = {"TITLE=a b c", "ARTIST=X", "Artist=Y"}; @@ -22,14 +22,23 @@ void check_read_comments() { std::string txt = "CORRUPTED=\xFF\xFF\n"s; ot::file input = fmemopen((char*) txt.data(), txt.size(), "r"); - rc = ot::read_comments(input.get(), comments); + rc = ot::read_comments(input.get(), comments, false); if (rc != ot::st::badly_encoded) throw failure("did not get the expected error reading corrupted data"); } + { + std::string txt = "RAW=\xFF\xFF\n"s; + ot::file input = fmemopen((char*) txt.data(), txt.size(), "r"); + rc = ot::read_comments(input.get(), comments, true); + if (rc != ot::st::ok) + throw failure("could not read comments"); + if (comments.front() != "RAW=\xFF\xFF") + throw failure("parsed user comments did not match expectations"); + } { std::string txt = "MALFORMED\n"s; ot::file input = fmemopen((char*) txt.data(), txt.size(), "r"); - rc = ot::read_comments(input.get(), comments); + rc = ot::read_comments(input.get(), comments, false); if (rc != ot::st::error) throw failure("did not get the expected error reading malformed comments"); } @@ -90,6 +99,10 @@ void check_good_arguments() if (opt.paths_in.size() != 1 || opt.paths_in[0] != "x" || !opt.edit_interactively || !opt.overwrite || !opt.in_place) throw failure("unexpected option parsing result for case #4"); + + opt = parse({"opustags", "-a", "X=\xFF", "--raw", "x"}); + if (!opt.raw || opt.to_add.front() != "X=\xFF") + throw failure("--raw did not disable transcoding"); } void check_bad_arguments() @@ -139,6 +152,15 @@ void check_bad_arguments() error_case({"opustags", "--edit", "x", "-i", "-d", "X"}, "Cannot mix --edit with -adDsS.", "mixing -e and -d"); error_case({"opustags", "--edit", "x", "-i", "-D"}, "Cannot mix --edit with -adDsS.", "mixing -e and -D"); error_case({"opustags", "--edit", "x", "-i", "-S"}, "Cannot mix --edit with -adDsS.", "mixing -e and -S"); + error_case({"opustags", "-d", "\xFF", "x"}, + "Could not encode argument into UTF-8: Some characters could not be converted into the target encoding.", + "-d with binary data"); + error_case({"opustags", "-a", "X=\xFF", "x"}, + "Could not encode argument into UTF-8: Some characters could not be converted into the target encoding.", + "-a with binary data"); + error_case({"opustags", "-s", "X=\xFF", "x"}, + "Could not encode argument into UTF-8: Some characters could not be converted into the target encoding.", + "-s with binary data"); } static void check_delete_comments() diff --git a/t/opustags.t b/t/opustags.t index 2d10112..fdc5654 100755 --- a/t/opustags.t +++ b/t/opustags.t @@ -4,7 +4,7 @@ use strict; use warnings; use utf8; -use Test::More tests => 47; +use Test::More tests => 50; use Digest::MD5; use File::Basename; @@ -72,6 +72,7 @@ Options: -s, --set FIELD=VALUE replace a comment -S, --set-all import comments from standard input -e, --edit edit tags interactively in VISUAL/EDITOR + --raw disable encoding conversion See the man page for extensive documentation. EOF @@ -279,7 +280,7 @@ TITLE= ARTIST=\xe9\xe0\xe7 I=\xf9\xce END_OUT -warning: Some characters are not supported by your system encoding and have been discarded. +warning: Some characters could not be converted to your system encoding and have been discarded. See --raw. END_ERR $ENV{LC_ALL} = ''; @@ -291,3 +292,23 @@ ARTIST=éàç I=ùÎ END_OUT } + + +#################################################################################################### +# Raw edition + +is_deeply(opustags(qw(-S out.opus -i --raw -a), "U=\xFE", {in => <<"END_IN", mode => ':raw'}), ['', '', 0], 'raw set-all with binary data'); +T=\xFF +END_IN + +is_deeply(opustags(qw(out.opus)), [<<"END_OUT", <<'END_ERR', 0], 'default read with binary data'); +T= +U= +END_OUT +warning: Some characters could not be converted to your system encoding and have been discarded. See --raw. +END_ERR + +is_deeply(opustags(qw(out.opus --raw), { mode => ':raw' }), [<<"END_OUT", '', 0], 'raw read'); +T=\xFF +U=\xFE +END_OUT