Introduce --raw for disabling transcoding

This commit is contained in:
Frédéric Mangano 2020-12-26 13:16:52 +01:00
parent 6db7f07bd5
commit 4cae6c44ee
7 changed files with 113 additions and 38 deletions

View File

@ -60,7 +60,6 @@ More generally, here are a few features that could be added in the future:
- Logicial stream listing and selection for multiplexed files.
- Escaping control characters with --escape.
- Dump binary packets with --binary.
- Skip encoding conversion with --raw.
- Edition of the vendor string.
- Edition of the arbitrary binary block past the comments.
- Support for OpusTags packets spanning multiple pages (> 64 kB).

View File

@ -62,5 +62,6 @@ Documentation
-s, --set FIELD=VALUE replace a comment
-S, --set-all import comments from standard input
-e, --edit edit tags interactively in VISUAL/EDITOR
--raw disable encoding conversion
See the man page, `opustags.1`, for extensive documentation.

View File

@ -103,6 +103,13 @@ Blank lines and lines starting with \fI#\fP are ignored.
Edit tags interactively by spawning the program specified by the EDITOR
environment variable. The allowed format is the same as \fB--set-all\fP.
If TERM and VISUAL are set, VISUAL takes precedence over EDITOR.
.TP
.B \-\-raw
OpusTags metadata should always be encoded in UTF-8, as per RFC 7845. However, some files may be
corrupted or possibly even contain intentional binary data. In that case, --raw lets you edit that
kind of binary data without ensuring the validity of the tags encoding. This option may also be
useful when your system encoding is different from UTF-8 and you wish to preserve the full UTF-8
character set even though your system cannot display it.
.SH EXAMPLES
.PP
List all the tags in file foo.opus:

View File

@ -38,6 +38,7 @@ Options:
-s, --set FIELD=VALUE replace a comment
-S, --set-all import comments from standard input
-e, --edit edit tags interactively in VISUAL/EDITOR
--raw disable encoding conversion
See the man page for extensive documentation.
)raw";
@ -53,6 +54,7 @@ static struct option getopt_options[] = {
{"delete-all", no_argument, 0, 'D'},
{"set-all", no_argument, 0, 'S'},
{"edit", no_argument, 0, 'e'},
{"raw", no_argument, 0, 'r'},
{NULL, 0, 0, 0}
};
@ -107,6 +109,9 @@ ot::status ot::parse_options(int argc, char** argv, ot::options& opt, FILE* comm
case 'e':
opt.edit_interactively = true;
break;
case 'r':
opt.raw = true;
break;
case ':':
return {st::bad_arguments,
"Missing value for option '"s + argv[optind - 1] + "'."};
@ -126,12 +131,14 @@ ot::status ot::parse_options(int argc, char** argv, ot::options& opt, FILE* comm
}
// Convert arguments to UTF-8.
for (std::list<std::string>* args : { &opt.to_add, &opt.to_delete }) {
for (std::string& arg : *args) {
rc = to_utf8(arg, utf8);
if (rc != ot::st::ok)
return {st::bad_arguments, "Could not encode argument into UTF-8: " + rc.message};
arg = std::move(utf8);
if (!opt.raw) {
for (std::list<std::string>* args : { &opt.to_add, &opt.to_delete }) {
for (std::string& arg : *args) {
rc = to_utf8(arg, utf8);
if (rc != ot::st::ok)
return {st::bad_arguments, "Could not encode argument into UTF-8: " + rc.message};
arg = std::move(utf8);
}
}
}
@ -159,7 +166,7 @@ ot::status ot::parse_options(int argc, char** argv, ot::options& opt, FILE* comm
if (set_all) {
// Read comments from stdin and prepend them to opt.to_add.
std::list<std::string> comments;
auto rc = read_comments(comments_input, comments);
auto rc = read_comments(comments_input, comments, opt.raw);
if (rc != st::ok)
return rc;
opt.to_add.splice(opt.to_add.begin(), std::move(comments));
@ -174,7 +181,7 @@ ot::status ot::parse_options(int argc, char** argv, ot::options& opt, FILE* comm
* callers that dont escape backslashes. Maybe add options to select a mode between simple,
* raw, and escaped.
*/
void ot::print_comments(const std::list<std::string>& comments, FILE* output)
void ot::print_comments(const std::list<std::string>& comments, FILE* output, bool raw)
{
static ot::encoding_converter from_utf8("UTF-8", "//IGNORE");
std::string local;
@ -182,25 +189,33 @@ void ot::print_comments(const std::list<std::string>& comments, FILE* output)
bool bad_comments = false;
bool has_newline = false;
bool has_control = false;
for (const std::string& comment : comments) {
ot::status rc = from_utf8(comment, local);
if (rc == ot::st::information_lost) {
info_lost = true;
} else if (rc != ot::st::ok) {
bad_comments = true;
continue;
for (const std::string& utf8_comment : comments) {
const std::string* comment;
// Convert the comment from UTF-8 to the system encoding if relevant.
if (raw) {
comment = &utf8_comment;
} else {
ot::status rc = from_utf8(utf8_comment, local);
comment = &local;
if (rc == ot::st::information_lost) {
info_lost = true;
} else if (rc != ot::st::ok) {
bad_comments = true;
continue;
}
}
for (unsigned char c : comment) {
for (unsigned char c : *comment) {
if (c == '\n')
has_newline = true;
else if (c < 0x20)
has_control = true;
}
fwrite(local.data(), 1, local.size(), output);
fwrite(comment->data(), 1, comment->size(), output);
putc('\n', output);
}
if (info_lost)
fputs("warning: Some characters are not supported by your system encoding and have been discarded.\n", stderr);
fputs("warning: Some characters could not be converted to your system encoding and have been discarded. See --raw.\n", stderr);
if (bad_comments)
fputs("warning: Some tags are not properly encoded and have not been displayed.\n", stderr);
if (has_newline)
@ -210,7 +225,7 @@ void ot::print_comments(const std::list<std::string>& comments, FILE* output)
fputs("warning: Some tags contain control characters.\n", stderr);
}
ot::status ot::read_comments(FILE* input, std::list<std::string>& comments)
ot::status ot::read_comments(FILE* input, std::list<std::string>& comments, bool raw)
{
static ot::encoding_converter to_utf8("", "UTF-8");
comments.clear();
@ -229,13 +244,17 @@ ot::status ot::read_comments(FILE* input, std::list<std::string>& comments)
free(line);
return rc;
}
std::string utf8;
ot::status rc = to_utf8(std::string_view(line, nread), utf8);
if (rc == ot::st::ok) {
comments.emplace_back(std::move(utf8));
if (raw) {
comments.emplace_back(line, nread);
} else {
free(line);
return {ot::st::badly_encoded, "UTF-8 conversion error: " + rc.message};
std::string utf8;
ot::status rc = to_utf8(std::string_view(line, nread), utf8);
if (rc == ot::st::ok) {
comments.emplace_back(std::move(utf8));
} else {
free(line);
return {ot::st::badly_encoded, "UTF-8 conversion error: " + rc.message};
}
}
}
free(line);
@ -281,7 +300,7 @@ static ot::status edit_tags(ot::opus_tags& tags, const ot::options& opt)
}
/** Spawn VISUAL or EDITOR to edit the given tags. */
static ot::status edit_tags_interactively(ot::opus_tags& tags, const std::optional<std::string>& base_path)
static ot::status edit_tags_interactively(ot::opus_tags& tags, const std::optional<std::string>& base_path, bool raw)
{
const char* editor = nullptr;
if (getenv("TERM") != nullptr)
@ -299,7 +318,7 @@ static ot::status edit_tags_interactively(ot::opus_tags& tags, const std::option
if (fd == -1 || (tags_file = fdopen(fd, "w")) == nullptr)
return {ot::st::standard_error,
"Could not open '" + tags_path + "': " + strerror(errno)};
ot::print_comments(tags.comments, tags_file);
ot::print_comments(tags.comments, tags_file, raw);
if (fclose(tags_file) != 0)
return {ot::st::standard_error, tags_path + ": fclose error: "s + strerror(errno)};
@ -328,7 +347,7 @@ static ot::status edit_tags_interactively(ot::opus_tags& tags, const std::option
tags_file = fopen(tags_path.c_str(), "re");
if (tags_file == nullptr)
return {ot::st::standard_error, "Error opening " + tags_path + ": " + strerror(errno)};
if ((rc = ot::read_comments(tags_file, tags.comments)) != ot::st::ok) {
if ((rc = ot::read_comments(tags_file, tags.comments, raw)) != ot::st::ok) {
fprintf(stderr, "warning: Leaving %s on the disk.\n", tags_path.c_str());
return rc;
}
@ -390,7 +409,7 @@ static ot::status process(ot::ogg_reader& reader, ot::ogg_writer* writer, const
if (writer) {
if (opt.edit_interactively) {
fflush(writer->file); // flush before calling the subprocess
if ((rc = edit_tags_interactively(tags, writer->path)) != ot::st::ok)
if ((rc = edit_tags_interactively(tags, writer->path, opt.raw)) != ot::st::ok)
return rc;
}
auto packet = ot::render_tags(tags);
@ -398,7 +417,7 @@ static ot::status process(ot::ogg_reader& reader, ot::ogg_writer* writer, const
if (rc != ot::st::ok)
return rc;
} else {
ot::print_comments(tags.comments, stdout);
ot::print_comments(tags.comments, stdout, opt.raw);
break;
}
} else {

View File

@ -466,6 +466,12 @@ struct options {
* Options: --add, --set, --set-all
*/
std::list<std::string> to_add;
/**
* Disable encoding conversions. OpusTags are specified to always be encoded as UTF-8, but
* if for some reason a specific file contains binary tags that someone would like to
* extract and set as-is, encoding conversion would get in the way.
*/
bool raw = false;
};
/**
@ -484,14 +490,14 @@ status parse_options(int argc, char** argv, options& opt, FILE* comments);
*
* The output generated is meant to be parseable by #ot::read_comments.
*/
void print_comments(const std::list<std::string>& comments, FILE* output);
void print_comments(const std::list<std::string>& comments, FILE* output, bool raw);
/**
* Parse the comments outputted by #ot::print_comments.
*
* The comments are converted from the system encoding to UTF-8, and returned as UTF-8.
*/
status read_comments(FILE* input, std::list<std::string>& comments);
status read_comments(FILE* input, std::list<std::string>& comments, bool raw);
/**
* Remove all comments matching the specified selector, which may either be a field name or a

View File

@ -12,7 +12,7 @@ void check_read_comments()
{
std::string txt = "TITLE=a b c\n\nARTIST=X\nArtist=Y\n"s;
ot::file input = fmemopen((char*) txt.data(), txt.size(), "r");
rc = ot::read_comments(input.get(), comments);
rc = ot::read_comments(input.get(), comments, false);
if (rc != ot::st::ok)
throw failure("could not read comments");
auto&& expected = {"TITLE=a b c", "ARTIST=X", "Artist=Y"};
@ -22,14 +22,23 @@ void check_read_comments()
{
std::string txt = "CORRUPTED=\xFF\xFF\n"s;
ot::file input = fmemopen((char*) txt.data(), txt.size(), "r");
rc = ot::read_comments(input.get(), comments);
rc = ot::read_comments(input.get(), comments, false);
if (rc != ot::st::badly_encoded)
throw failure("did not get the expected error reading corrupted data");
}
{
std::string txt = "RAW=\xFF\xFF\n"s;
ot::file input = fmemopen((char*) txt.data(), txt.size(), "r");
rc = ot::read_comments(input.get(), comments, true);
if (rc != ot::st::ok)
throw failure("could not read comments");
if (comments.front() != "RAW=\xFF\xFF")
throw failure("parsed user comments did not match expectations");
}
{
std::string txt = "MALFORMED\n"s;
ot::file input = fmemopen((char*) txt.data(), txt.size(), "r");
rc = ot::read_comments(input.get(), comments);
rc = ot::read_comments(input.get(), comments, false);
if (rc != ot::st::error)
throw failure("did not get the expected error reading malformed comments");
}
@ -90,6 +99,10 @@ void check_good_arguments()
if (opt.paths_in.size() != 1 || opt.paths_in[0] != "x" ||
!opt.edit_interactively || !opt.overwrite || !opt.in_place)
throw failure("unexpected option parsing result for case #4");
opt = parse({"opustags", "-a", "X=\xFF", "--raw", "x"});
if (!opt.raw || opt.to_add.front() != "X=\xFF")
throw failure("--raw did not disable transcoding");
}
void check_bad_arguments()
@ -139,6 +152,15 @@ void check_bad_arguments()
error_case({"opustags", "--edit", "x", "-i", "-d", "X"}, "Cannot mix --edit with -adDsS.", "mixing -e and -d");
error_case({"opustags", "--edit", "x", "-i", "-D"}, "Cannot mix --edit with -adDsS.", "mixing -e and -D");
error_case({"opustags", "--edit", "x", "-i", "-S"}, "Cannot mix --edit with -adDsS.", "mixing -e and -S");
error_case({"opustags", "-d", "\xFF", "x"},
"Could not encode argument into UTF-8: Some characters could not be converted into the target encoding.",
"-d with binary data");
error_case({"opustags", "-a", "X=\xFF", "x"},
"Could not encode argument into UTF-8: Some characters could not be converted into the target encoding.",
"-a with binary data");
error_case({"opustags", "-s", "X=\xFF", "x"},
"Could not encode argument into UTF-8: Some characters could not be converted into the target encoding.",
"-s with binary data");
}
static void check_delete_comments()

View File

@ -4,7 +4,7 @@ use strict;
use warnings;
use utf8;
use Test::More tests => 47;
use Test::More tests => 50;
use Digest::MD5;
use File::Basename;
@ -72,6 +72,7 @@ Options:
-s, --set FIELD=VALUE replace a comment
-S, --set-all import comments from standard input
-e, --edit edit tags interactively in VISUAL/EDITOR
--raw disable encoding conversion
See the man page for extensive documentation.
EOF
@ -279,7 +280,7 @@ TITLE=
ARTIST=\xe9\xe0\xe7
I=\xf9\xce
END_OUT
warning: Some characters are not supported by your system encoding and have been discarded.
warning: Some characters could not be converted to your system encoding and have been discarded. See --raw.
END_ERR
$ENV{LC_ALL} = '';
@ -291,3 +292,23 @@ ARTIST=éàç
I=ùÎ
END_OUT
}
####################################################################################################
# Raw edition
is_deeply(opustags(qw(-S out.opus -i --raw -a), "U=\xFE", {in => <<"END_IN", mode => ':raw'}), ['', '', 0], 'raw set-all with binary data');
T=\xFF
END_IN
is_deeply(opustags(qw(out.opus)), [<<"END_OUT", <<'END_ERR', 0], 'default read with binary data');
T=
U=
END_OUT
warning: Some characters could not be converted to your system encoding and have been discarded. See --raw.
END_ERR
is_deeply(opustags(qw(out.opus --raw), { mode => ':raw' }), [<<"END_OUT", '', 0], 'raw read');
T=\xFF
U=\xFE
END_OUT