From 9664be450eb8682d91bf5e9d2b7ae80fb639306c Mon Sep 17 00:00:00 2001
From: Lars Mueller
Date: Thu, 16 Sep 2021 19:30:24 +0200
Subject: [PATCH] Fix JSON reader surrogate pair handling

---
Reviewer notes (everything between the "---" line and the first "diff --git"
line is ignored by git am):
- json.lua: a \uXXXX escape in 0xD800-0xDBFF is held back in `high_surrogate`
  instead of being written immediately. If the next character is a \uXXXX
  escape in 0xDC00-0xDFFF, both are merged into one code point
  (0x10000 + (high - 0xD800) * 0x400 + (low - 0xDC00)); otherwise, and also at
  the closing quote, the held value is passed to utf8_codepoint on its own.
- test.lua: covers a valid pair, a reversed pair, a lone high and a lone low
  surrogate, each with and without an "x" prefix/suffix.
- NOTE(review): line breaks were restored from a whitespace-collapsed copy of
  this patch. Blank context lines and tab indentation are inferred (they are
  consistent with the hunk line counts and the diffstat) - confirm against the
  target tree, or apply with --ignore-whitespace.

 json.lua | 38 ++++++++++++++++++++++++++++++++------
 test.lua | 17 ++++++++++++++++-
 2 files changed, 48 insertions(+), 7 deletions(-)

diff --git a/json.lua b/json.lua
index 5a1d72d..0d602fc 100644
--- a/json.lua
+++ b/json.lua
@@ -5,6 +5,7 @@ local _ENV = {}
 setfenv(1, _ENV)
 
 --! experimental
+-- See https://tools.ietf.org/id/draft-ietf-json-rfc4627bis-09.html#unichars and https://json.org
 
 -- Null
 -- TODO consider using userdata (for ex. by using newproxy)
@@ -100,6 +101,7 @@ for i = 0, 5 do
 end
 
 -- TODO SAX vs DOM
+local utf8 = modlib.text.utf8
 function read(self, read_)
 	local index = 0
 	local char
@@ -146,27 +148,51 @@ function read(self, read_)
 			read()
 		end
 	end
+	local function utf8_codepoint(codepoint)
+		return syntax_assert(utf8(codepoint), "invalid codepoint")
+	end
 	local function string()
 		local chars = {}
+		local high_surrogate
 		while true do
+			local string_char, next_high_surrogate
 			if char == '"' then
+				if high_surrogate then
+					table_insert(chars, utf8_codepoint(high_surrogate))
+				end
 				return table_concat(chars)
 			end
 			if char == "\\" then
 				read()
 				if char == "u" then
-					local num = 0
+					local codepoint = 0
 					for i = 3, 0, -1 do
-						num = syntax_assert(hex_digit_values[read()], "expected a hex digit") * (16 ^ i) + num
+						codepoint = syntax_assert(hex_digit_values[read()], "expected a hex digit") * (16 ^ i) + codepoint
+					end
+					if high_surrogate and codepoint >= 0xDC00 and codepoint <= 0xDFFF then
+						-- TODO strict mode: throw an error for single surrogates
+						codepoint = 0x10000 + (high_surrogate - 0xD800) * 0x400 + codepoint - 0xDC00
+						-- Don't write the high surrogate
+						high_surrogate = nil
+					end
+					if codepoint >= 0xD800 and codepoint <= 0xDBFF then
+						next_high_surrogate = codepoint
+					else
+						string_char = utf8_codepoint(codepoint)
 					end
-					table_insert(chars, syntax_assert(modlib.text.utf8(num), "invalid codepoint"))
 				else
-					table_insert(chars, syntax_assert(decoding_escapes[char], "invalid escape sequence"))
+					string_char = syntax_assert(decoding_escapes[char], "invalid escape sequence")
 				end
 			else
-				syntax_assert(char, "unclosed string")
 				-- TODO check whether the character is one that must be escaped ("strict" mode)
-				table_insert(chars, char)
+				string_char = syntax_assert(char, "unclosed string")
+			end
+			if high_surrogate then
+				table_insert(chars, utf8_codepoint(high_surrogate))
+			end
+			high_surrogate = next_high_surrogate
+			if string_char then
+				table_insert(chars, string_char)
 			end
 			read()
 		end
diff --git a/test.lua b/test.lua
index 66e588a..945b97e 100644
--- a/test.lua
+++ b/test.lua
@@ -282,7 +282,22 @@ do
 		return json:read_string(json:write_string(object))
 	end)
 	-- Verify spacing is accepted
-	assert(modlib.table.equals_noncircular(json:read_string'\t\t\n{ "a" : 1, \t"b":2, "c" : [ 1, 2 ,3 ] } \n\r\t', {a = 1, b = 2, c = {1, 2, 3}}))
+	assert(table.equals_noncircular(json:read_string'\t\t\n{ "a" : 1, \t"b":2, "c" : [ 1, 2 ,3 ] } \n\r\t', {a = 1, b = 2, c = {1, 2, 3}}))
+	-- Simple surrogate pair tests
+	for _, prefix in pairs{"x", ""} do
+		for _, suffix in pairs{"x", ""} do
+			local function test(str, expected_str)
+				if type(expected_str) == "number" then
+					expected_str = text.utf8(expected_str)
+				end
+				return assert(json:read_string('"' .. prefix .. str .. suffix .. '"') == prefix .. expected_str .. suffix)
+			end
+			test([[\uD834\uDD1E]], 0x1D11E)
+			test([[\uDD1E\uD834]], text.utf8(0xDD1E) .. text.utf8(0xD834))
+			test([[\uD834]], 0xD834)
+			test([[\uDD1E]], 0xDD1E)
+		end
+	end
 end
 
 -- luon