Support multiline tags in non-UTF-8 environments

The \n to \n\t substitution assumed an ASCII-compatible environment and would not have worked under
UTF-16. It’s therefore safer to perform the operation before the encoding conversion.
This commit is contained in:
Frédéric Mangano 2023-02-07 15:24:53 +09:00
parent d95fd45aef
commit 6d6722fb24
2 changed files with 48 additions and 57 deletions

View File

@ -15,8 +15,6 @@
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>
#include <assert.h>
#include <cctype> // for std::iscntrl
using namespace std::literals::string_literals;
@ -172,77 +170,66 @@ ot::options ot::parse_options(int argc, char** argv, FILE* comments_input)
return opt;
}
/**
 * Mark the continuation lines of a multiline UTF-8 value by inserting a tabulation (\t) right
 * after every line feed (\n). A value without any line feed is returned as a plain copy.
 */
static std::string format_value(const std::string& source)
{
	const auto line_feeds = std::count(source.begin(), source.end(), '\n');
	// Single-line values need no marker at all: hand back an untouched copy.
	if (line_feeds == 0)
		return source;

	std::string result;
	// Exactly one extra byte is needed per line feed.
	result.reserve(source.size() + line_feeds);
	std::string::size_type chunk_start = 0;
	for (;;) {
		const std::string::size_type lf = source.find('\n', chunk_start);
		if (lf == std::string::npos)
			break;
		// Copy everything up to and including the line feed, then the continuation marker.
		result.append(source, chunk_start, lf - chunk_start + 1);
		result.push_back('\t');
		chunk_start = lf + 1;
	}
	// Whatever follows the last line feed (possibly nothing).
	result.append(source, chunk_start, std::string::npos);
	return result;
}
/**
* Print comments in a human readable format that can also be read
* back in by #read_comment.
* Print comments in a human readable format that can also be read back in by #read_comment.
*
* To disambiguate between a newline embedded in a comment and a
* newline representing the start of the next tag, continuation lines
* always have a single TAB (^I) character added to the beginning.
*
* To disambiguate between a newline embedded in a comment and a newline representing the start of
* the next tag, continuation lines always have a single TAB (^I) character added to the beginning.
*/
void ot::print_comments(const std::list<std::string>& comments, FILE* output, bool raw)
{
// Function-local static: the converter is constructed once, on the first call, and reused.
// NOTE(review): the empty target encoding presumably selects the locale's encoding — confirm
// against ot::encoding_converter.
static ot::encoding_converter from_utf8("UTF-8", "");
std::string local;
bool has_newline = false;
// Raised as soon as any comment contains a control character; reported once after the loop.
bool has_control = false;
for (const std::string& utf8_comment : comments) {
const std::string* commentp;
for (const std::string& source_comment : comments) {
if (!has_control) { // Don't bother analyzing comments if the flag is already up.
// Any byte below 0x20 other than a line feed counts as a control character here.
for (unsigned char c : source_comment) {
if (c < 0x20 && c != '\n') {
has_control = true;
break;
}
}
}
// Mark continuation lines with format_value before the encoding conversion below.
std::string utf8_comment = format_value(source_comment);
const std::string* comment;
// Convert the comment from UTF-8 to the system encoding if relevant.
if (raw) {
commentp = &utf8_comment;
comment = &utf8_comment;
} else {
try {
local = from_utf8(utf8_comment);
commentp = &local;
comment = &local;
} catch (ot::status& rc) {
// Append a hint about --raw to the error message, then rethrow the same exception.
rc.message += " See --raw.";
throw;
}
}
// Check for embedded newlines so we can insert TAB afterward
std::string comment = *commentp;
unsigned int newline_count = 0;
for (int t = 0; t < comment.length(); t++) {
if (comment[t] == '\n') {
newline_count++;
}
else if (std::iscntrl(comment[t])) {
has_control = true;
}
}
// Copy byte by byte into a new string with TAB added after each newline
std::string tabbed_comment;
if (newline_count) {
tabbed_comment.resize(comment.length() + newline_count);
tabbed_comment.reserve( tabbed_comment.size() );
int tabs_done = 0;
for (int t = 0; t < comment.length(); t++) {
tabbed_comment[t + tabs_done] = comment[t];
if (comment[t] == '\n') {
tabs_done++;
tabbed_comment[t+tabs_done] = '\t';
}
}
// Assertion: Inserted as many tabs as newlines were found.
assert(newline_count == tabs_done);
// Assertion: Length of new string is exactly as allocated.
assert(tabbed_comment.length() == comment.length() + newline_count);
fwrite(tabbed_comment.data(), 1, tabbed_comment.size(), output);
}
else {
fwrite(comment.data(), 1, comment.size(), output);
}
// Write the comment, then a line feed to terminate the tag.
fwrite(comment->data(), 1, comment->size(), output);
putc('\n', output);
}
// Emit a single warning on stderr, however many comments contained control characters.
if (has_control)
fputs("warning: Some tags contain control characters.\n", stderr);
}

View File

@ -4,7 +4,7 @@ use strict;
use warnings;
use utf8;
use Test::More tests => 54;
use Test::More tests => 55;
use Digest::MD5;
use File::Basename;
@ -158,14 +158,18 @@ error: Comment does not contain an equal sign: FOO.
EOF
is(md5('out.opus'), '66780307a6081523dc9040f3c47b0448', 'the file did not change');
is_deeply(opustags('out.opus', '-D', '-a', "X=foo\nbar\tquux"), [<<'END_OUT', <<'END_ERR', 0], 'control characters');
X=foo
bar quux
is_deeply(opustags('out.opus', '-D', '-a', "X=foobar\tquux"), [<<'END_OUT', <<'END_ERR', 0], 'control characters');
X=foobar quux
END_OUT
warning: Some tags contain unsupported newline characters.
warning: Some tags contain control characters.
END_ERR
is_deeply(opustags('out.opus', '-D', '-a', "X=foo\n\nbar"), [<<'END_OUT', '', 0], 'newline characters');
X=foo
bar
END_OUT
is_deeply(opustags(qw(-i out.opus -s fatal=yes -s FOO -s BAR)), ['', <<'EOF', 512], 'bad tag with --set');
error: Comment does not contain an equal sign: FOO.
EOF