Fix JSON reader surrogate pair handling

This commit is contained in:
Lars Mueller 2021-09-16 19:30:24 +02:00
parent 2a6846c9a2
commit 9664be450e
2 changed files with 48 additions and 7 deletions

@ -5,6 +5,7 @@ local _ENV = {}
setfenv(1, _ENV)
--! experimental
-- See https://tools.ietf.org/id/draft-ietf-json-rfc4627bis-09.html#unichars and https://json.org
-- Null
-- TODO consider using userdata (e.g. by using newproxy)
@ -100,6 +101,7 @@ for i = 0, 5 do
end
-- TODO SAX vs DOM
local utf8 = modlib.text.utf8
function read(self, read_)
local index = 0
local char
@ -146,27 +148,51 @@ function read(self, read_)
read()
end
end
-- Converts a numeric codepoint to its string form using the `utf8` helper
-- (bound to modlib.text.utf8 above). The result is passed through
-- syntax_assert with the message "invalid codepoint", so a falsy encoding
-- result is reported through the reader's syntax-error path
-- (syntax_assert is defined outside this view).
local function utf8_codepoint(codepoint)
	local encoded = utf8(codepoint)
	return syntax_assert(encoded, "invalid codepoint")
end
local function string()
local chars = {}
local high_surrogate
while true do
local string_char, next_high_surrogate
if char == '"' then
if high_surrogate then
table_insert(chars, utf8_codepoint(high_surrogate))
end
return table_concat(chars)
end
if char == "\\" then
read()
if char == "u" then
local num = 0
local codepoint = 0
for i = 3, 0, -1 do
num = syntax_assert(hex_digit_values[read()], "expected a hex digit") * (16 ^ i) + num
codepoint = syntax_assert(hex_digit_values[read()], "expected a hex digit") * (16 ^ i) + codepoint
end
table_insert(chars, syntax_assert(modlib.text.utf8(num), "invalid codepoint"))
if high_surrogate and codepoint >= 0xDC00 and codepoint <= 0xDFFF then
-- TODO strict mode: throw an error for single surrogates
codepoint = 0x10000 + (high_surrogate - 0xD800) * 0x400 + codepoint - 0xDC00
-- Don't write the high surrogate
high_surrogate = nil
end
if codepoint >= 0xD800 and codepoint <= 0xDBFF then
next_high_surrogate = codepoint
else
table_insert(chars, syntax_assert(decoding_escapes[char], "invalid escape sequence"))
string_char = utf8_codepoint(codepoint)
end
else
string_char = syntax_assert(decoding_escapes[char], "invalid escape sequence")
end
else
syntax_assert(char, "unclosed string")
-- TODO check whether the character is one that must be escaped ("strict" mode)
table_insert(chars, char)
string_char = syntax_assert(char, "unclosed string")
end
if high_surrogate then
table_insert(chars, utf8_codepoint(high_surrogate))
end
high_surrogate = next_high_surrogate
if string_char then
table_insert(chars, string_char)
end
read()
end

@ -282,7 +282,22 @@ do
return json:read_string(json:write_string(object))
end)
-- Verify spacing is accepted
assert(modlib.table.equals_noncircular(json:read_string'\t\t\n{ "a" : 1, \t"b":2, "c" : [ 1, 2 ,3 ] } \n\r\t', {a = 1, b = 2, c = {1, 2, 3}}))
assert(table.equals_noncircular(json:read_string'\t\t\n{ "a" : 1, \t"b":2, "c" : [ 1, 2 ,3 ] } \n\r\t', {a = 1, b = 2, c = {1, 2, 3}}))
-- Simple surrogate pair tests
-- Run every case with and without a plain character before/after the escapes,
-- so surrogate handling is exercised at the string start, middle and end.
for _, prefix in pairs{"x", ""} do
	for _, suffix in pairs{"x", ""} do
		-- Parses `str` (raw JSON escape text) wrapped in quotes together with the
		-- current prefix/suffix and asserts that it decodes to `expected_str`.
		-- A numeric `expected_str` is a codepoint and is encoded via text.utf8 first.
		local function test(str, expected_str)
			local expected = expected_str
			if type(expected) == "number" then
				expected = text.utf8(expected)
			end
			local json_literal = '"' .. prefix .. str .. suffix .. '"'
			local decoded = json:read_string(json_literal)
			local wanted = prefix .. expected .. suffix
			return assert(decoded == wanted)
		end
		-- High surrogate followed by low surrogate: combined into one codepoint
		test([[\uD834\uDD1E]], 0x1D11E)
		-- Low surrogate followed by high surrogate: no pairing, both encoded separately
		test([[\uDD1E\uD834]], text.utf8(0xDD1E) .. text.utf8(0xD834))
		-- Lone high surrogate is encoded as-is
		test([[\uD834]], 0xD834)
		-- Lone low surrogate is encoded as-is
		test([[\uDD1E]], 0xDD1E)
	end
end
end
-- luon