From 9664be450eb8682d91bf5e9d2b7ae80fb639306c Mon Sep 17 00:00:00 2001
From: Lars Mueller <appgurulars@gmx.de>
Date: Thu, 16 Sep 2021 19:30:24 +0200
Subject: [PATCH] Fix JSON reader surrogate pair handling

---
 json.lua | 38 ++++++++++++++++++++++++++++++++------
 test.lua | 17 ++++++++++++++++-
 2 files changed, 48 insertions(+), 7 deletions(-)

diff --git a/json.lua b/json.lua
index 5a1d72d..0d602fc 100644
--- a/json.lua
+++ b/json.lua
@@ -5,6 +5,7 @@ local _ENV = {}
 setfenv(1, _ENV)
 
 --! experimental
+-- See https://tools.ietf.org/id/draft-ietf-json-rfc4627bis-09.html#unichars and https://json.org
 
 -- Null
 -- TODO consider using userdata (for ex. by using newproxy)
@@ -100,6 +101,7 @@ for i = 0, 5 do
 end
 
 -- TODO SAX vs DOM
+local utf8 = modlib.text.utf8
 function read(self, read_)
 	local index = 0
 	local char
@@ -146,27 +148,51 @@ function read(self, read_)
 			read()
 		end
 	end
+	local function utf8_codepoint(codepoint) -- passes utf8(codepoint) (modlib.text.utf8) through syntax_assert with the message "invalid codepoint"
+		return syntax_assert(utf8(codepoint), "invalid codepoint")
+	end
 	local function string()
 		local chars = {}
+		local high_surrogate
 		while true do
+			local string_char, next_high_surrogate
 			if char == '"' then
+				if high_surrogate then
+					table_insert(chars, utf8_codepoint(high_surrogate))
+				end
 				return table_concat(chars)
 			end
 			if char == "\\" then
 				read()
 				if char == "u" then
-					local num = 0
+					local codepoint = 0
 					for i = 3, 0, -1 do
-						num = syntax_assert(hex_digit_values[read()], "expected a hex digit") * (16 ^ i) + num
+						codepoint = syntax_assert(hex_digit_values[read()], "expected a hex digit") * (16 ^ i) + codepoint
+					end
+					if high_surrogate and codepoint >= 0xDC00 and codepoint <= 0xDFFF then
+						-- TODO strict mode: throw an error for single surrogates
+						codepoint = 0x10000 + (high_surrogate - 0xD800) * 0x400 + codepoint - 0xDC00
+						-- Don't write the high surrogate
+						high_surrogate = nil
+					end
+					if codepoint >= 0xD800 and codepoint <= 0xDBFF then
+						next_high_surrogate = codepoint
+					else
+						string_char = utf8_codepoint(codepoint)
 					end
-					table_insert(chars, syntax_assert(modlib.text.utf8(num), "invalid codepoint"))
 				else
-					table_insert(chars, syntax_assert(decoding_escapes[char], "invalid escape sequence"))
+					string_char = syntax_assert(decoding_escapes[char], "invalid escape sequence")
 				end
 			else
-				syntax_assert(char, "unclosed string")
 				-- TODO check whether the character is one that must be escaped ("strict" mode)
-				table_insert(chars, char)
+				string_char = syntax_assert(char, "unclosed string")
+			end
+			if high_surrogate then
+				table_insert(chars, utf8_codepoint(high_surrogate))
+			end
+			high_surrogate = next_high_surrogate
+			if string_char then
+				table_insert(chars, string_char)
 			end
 			read()
 		end
diff --git a/test.lua b/test.lua
index 66e588a..945b97e 100644
--- a/test.lua
+++ b/test.lua
@@ -282,7 +282,22 @@ do
 		return json:read_string(json:write_string(object))
 	end)
 	-- Verify spacing is accepted
-	assert(modlib.table.equals_noncircular(json:read_string'\t\t\n{ "a"   : 1, \t"b":2, "c" : [ 1, 2 ,3  ]   }  \n\r\t', {a = 1, b = 2, c = {1, 2, 3}}))
+	assert(table.equals_noncircular(json:read_string'\t\t\n{ "a"   : 1, \t"b":2, "c" : [ 1, 2 ,3  ]   }  \n\r\t', {a = 1, b = 2, c = {1, 2, 3}}))
+	-- Simple surrogate pair tests; every case runs with and without an "x" before and after the escapes
+	for _, prefix in pairs{"x", ""} do
+		for _, suffix in pairs{"x", ""} do
+			local function test(str, expected_str) -- str: JSON escape text placed inside the quotes; expected_str: expected decoded string, or a number (see below)
+				if type(expected_str) == "number" then -- a numeric expectation is converted to a string with text.utf8
+					expected_str = text.utf8(expected_str)
+				end
+				return assert(json:read_string('"' .. prefix .. str .. suffix .. '"') == prefix .. expected_str .. suffix)
+			end
+			test([[\uD834\uDD1E]],  0x1D11E) -- high then low surrogate: expected to combine into the single codepoint 0x1D11E
+			test([[\uDD1E\uD834]], text.utf8(0xDD1E) .. text.utf8(0xD834)) -- low then high: not a pair, each is expected to be converted on its own
+			test([[\uD834]], 0xD834) -- lone high surrogate is expected to be kept as-is
+			test([[\uDD1E]], 0xDD1E) -- lone low surrogate is expected to be kept as-is
+		end
+	end
 end
 
 -- luon