Skip to content

Commit

Permalink
AVRO-3860: Fix for wrong encoding of Unicode values above 0xffff (#2831)
Browse files Browse the repository at this point in the history
* Fix for wrong encoding of Unicode values above 0xffff

* More appropriate error message with wrong Unicode escapes

* Fixed a subtle bug in detecting surrogate codes of UTF-16 encoding

* Fixed a bug that allowed prohibited Unicode values

---------

Co-authored-by: Thiruvalluvan M G <[email protected]>
  • Loading branch information
thiru-mg and Thiruvalluvan M G committed Apr 6, 2024
1 parent cffffe7 commit 00afbae
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 25 deletions.
82 changes: 58 additions & 24 deletions lang/c++/impl/json/JsonIO.cc
Original file line number Diff line number Diff line change
Expand Up @@ -314,19 +314,41 @@ JsonParser::Token JsonParser::tryString() {
}
}


// Decode the given string and return contents as UTF8-encoded bytes.
// The input does not have the enclosing double-quotes.
string JsonParser::decodeString(const string &s, bool binary) {
string result;
const auto readNextByte = [](string::const_iterator &it, const string::const_iterator &end) -> char {
if (it == end)
auto it = s.cbegin();
const auto end = s.cend();
const auto readNextByte = [&]() -> char {
if (it == end) {
throw Exception("Unexpected EOF");
}
return *it++;
};
auto it = s.cbegin();
const auto end = s.cend();
const auto unicodeParse = [&]() {
uint32_t n = 0;
for (int i = 0; i < 4; i++) {
auto c = readNextByte();
n *= 16;
if (isdigit(c)) {
n += c - '0';
} else if (c >= 'a' && c <= 'f') {
n += c - 'a' + 10;
} else if (c >= 'A' && c <= 'F') {
n += c - 'A' + 10;
} else {
throw Exception(boost::format( "Invalid hex character: %1%") % c);
}
}
return n;
};
while (it != end) {
char ch = *it++;
string::const_iterator startSeq = it;
char ch = readNextByte();
if (ch == '\\') {
ch = readNextByte(it, end);
ch = readNextByte();
switch (ch) {
case '"':
case '\\':
Expand All @@ -350,30 +372,42 @@ string JsonParser::decodeString(const string &s, bool binary) {
continue;
case 'u':
case 'U': {
uint32_t n = 0;
char e[4];
for (char &i : e) {
n *= 16;
char c = readNextByte(it, end);
i = c;
if (isdigit(c)) {
n += c - '0';
} else if (c >= 'a' && c <= 'f') {
n += c - 'a' + 10;
} else if (c >= 'A' && c <= 'F') {
n += c - 'A' + 10;
}
}
uint32_t n = unicodeParse();
if (binary) {
if (n > 0xff) {
throw Exception(boost::format(
"Invalid byte for binary: %1%%2%")
% ch % string(e, 4));
% ch % string(startSeq, ++it));
} else {
result.push_back(n);
continue;
}
}
if (n >= 0xd800 && n < 0xdc00) {
ch = readNextByte();
if (ch != '\\') {
throw Exception(boost::format(
"Invalid unicode sequence: %1%")
% string(startSeq, it));
}
ch = readNextByte();
if (ch != 'u' && ch != 'U') {
throw Exception(boost::format(
"Invalid unicode sequence: %1%")
% string(startSeq, it));
}
uint32_t m = unicodeParse();
if (m < 0xdc00 || m > 0xdfff) {
throw Exception(boost::format(
"Invalid unicode sequence: %1%")
% string(startSeq, it));
}
n = 0x10000 + (((n - 0xd800) << 10) | (m - 0xdc00));
} else if (n >= 0xdc00 && n < 0xdfff) {
throw Exception(boost::format(
"Invalid unicode sequence: %1%")
% string(startSeq, it));
}
if (n < 0x80) {
result.push_back(n);
} else if (n < 0x800) {
Expand All @@ -383,15 +417,15 @@ string JsonParser::decodeString(const string &s, bool binary) {
result.push_back((n >> 12) | 0xe0);
result.push_back(((n >> 6) & 0x3f) | 0x80);
result.push_back((n & 0x3f) | 0x80);
} else if (n < 110000) {
} else if (n < 0x110000) {
result.push_back((n >> 18) | 0xf0);
result.push_back(((n >> 12) & 0x3f) | 0x80);
result.push_back(((n >> 6) & 0x3f) | 0x80);
result.push_back((n & 0x3f) | 0x80);
} else {
throw Exception(boost::format(
"Invalid unicode value: %1%i%2%")
% ch % string(e, 4));
"Invalid unicode value: %1%%2%")
% n % string(startSeq, ++it));
}
}
continue;
Expand Down
13 changes: 12 additions & 1 deletion lang/c++/impl/json/JsonIO.hh
Original file line number Diff line number Diff line change
Expand Up @@ -263,12 +263,23 @@ class AVRO_DECL JsonGenerator {
out_.write(toHex((static_cast<unsigned char>(c)) % 16));
}

void escapeUnicode(uint32_t c) {
// Writes one JSON "\uXXXX" escape for the low 16 bits of `c`.
// The value is emitted as two bytes, high byte first, each passed to writeHex.
void escapeUnicode16(uint32_t c) {
    const uint32_t highByte = (c >> 8) & 0xff;
    const uint32_t lowByte = c & 0xff;
    out_.write('\\');
    out_.write('u');
    writeHex(highByte);
    writeHex(lowByte);
}
// Writes the JSON escape sequence for code point `c`.
//   - c < 0x10000: a single \uXXXX escape.
//   - 0x10000 <= c < 0x110000: two \uXXXX escapes forming a UTF-16 surrogate
//     pair (high surrogate first, then low surrogate).
//   - otherwise: throws Exception("Invalid code-point: ...").
void escapeUnicode(uint32_t c) {
    // Reject values at or above 0x110000 up front.
    if (c >= 0x110000) {
        throw Exception(boost::format("Invalid code-point: %1%") % c);
    }
    // Values below 0x10000 fit in one 16-bit escape.
    if (c < 0x10000) {
        escapeUnicode16(c);
        return;
    }
    // Split the 20-bit offset into two 10-bit halves: the upper half goes
    // into the 0xd800 range, the lower half into the 0xdc00 range.
    const uint32_t offset = c - 0x10000;
    const uint32_t highSurrogate = ((offset >> 10) & 0x3ff) | 0xd800;
    const uint32_t lowSurrogate = (offset & 0x3ff) | 0xdc00;
    escapeUnicode16(highSurrogate);
    escapeUnicode16(lowSurrogate);
}
void doEncodeString(const char *b, size_t len, bool binary) {
const char *e = b + len;
out_.write('"');
Expand Down
1 change: 1 addition & 0 deletions lang/c++/test/JsonTests.cc
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ TestData<const char *> stringData[] = {
{R"("\u20ac")", EntityType::String, "\xe2\x82\xac", R"("\u20ac")"},
{R"("\u03c0")", EntityType::String, "\xcf\x80", R"("\u03c0")"},
{R"("hello\n")", EntityType::String, "hello\n", R"("hello\n")"},
{R"("\Ud8ab\udccd")", EntityType::String, "\xf0\xba\xb3\x8d", R"("\ud8ab\udccd")"},
};

void testBool(const TestData<bool> &d) {
Expand Down

0 comments on commit 00afbae

Please sign in to comment.