From 825599f2d09ec4473f1441e985ab235a604e4799 Mon Sep 17 00:00:00 2001
From: Lars Mueller
Date: Fri, 9 Sep 2022 14:17:18 +0200
Subject: [PATCH] Move text.utf8 to utf8.char

---
 build/html_entities.lua |  2 +-
 init.lua                |  1 +
 json.lua                |  4 ++--
 text.lua                | 21 ---------------------
 utf8.lua                | 33 +++++++++++++++++++++++++++++++++
 web/html.lua            |  4 ++--
 6 files changed, 39 insertions(+), 26 deletions(-)
 create mode 100644 utf8.lua

diff --git a/build/html_entities.lua b/build/html_entities.lua
index 57ebf04..60083f1 100644
--- a/build/html_entities.lua
+++ b/build/html_entities.lua
@@ -5,7 +5,7 @@ local res, code = https.request"https://html.spec.whatwg.org/entities.json"
 assert(code == 200)
 local entity_map = {}
 for entity, chars in pairs(assert(modlib.json:read_string(res))) do
-	entity_map[entity:sub(2, #entity - 1)] = table.concat(modlib.table.map(chars.codepoints, modlib.text.utf8))
+	entity_map[entity:sub(2, #entity - 1)] = table.concat(modlib.table.map(chars.codepoints, modlib.utf8.char))
 end
 local entries = {}
 for entity, chars in pairs(entity_map) do
diff --git a/init.lua b/init.lua
index 755e9d2..c0109cf 100644
--- a/init.lua
+++ b/init.lua
@@ -45,6 +45,7 @@ for _, file in pairs{
 	"table",
 	"vararg",
 	"text",
+	"utf8",
 	"vector",
 	"quaternion",
 	"trie",
diff --git a/json.lua b/json.lua
index e4f9bb3..1168d07 100644
--- a/json.lua
+++ b/json.lua
@@ -101,7 +101,7 @@ for i = 0, 5 do
 end
 
 -- TODO SAX vs DOM
-local utf8 = modlib.text.utf8
+local utf8_char = modlib.utf8.char
 function read(self, read_)
 	local index = 0
 	local char
@@ -149,7 +149,7 @@ function read(self, read_)
 		end
 	end
 	local function utf8_codepoint(codepoint)
-		return syntax_assert(utf8(codepoint), "invalid codepoint")
+		return syntax_assert(utf8_char(codepoint), "invalid codepoint")
 	end
 	local function string()
 		local chars = {}
diff --git a/text.lua b/text.lua
index 5574c0f..6a1c859 100644
--- a/text.lua
+++ b/text.lua
@@ -138,27 +138,6 @@ magic_charset = "[" .. table.concat(magic_charset) .. "]"
 function escape_magic_chars(text)
 	return text:gsub("(" .. magic_charset .. ")", "%%%1")
 end
-function utf8(number)
-	if number <= 0x007F then
-		-- Single byte
-		return string.char(number)
-	end
-	if number < 0x00A0 or number > 0x10FFFF then
-		-- Out of range
-		return
-	end
-	local result = ""
-	local i = 0
-	while true do
-		local remainder = number % 64
-		result = string.char(128 + remainder) .. result
-		number = (number - remainder) / 64
-		i = i + 1
-		if number <= 2 ^ (8 - i - 2) then break end
-	end
-	return string.char(256 - 2 ^ (8 - i - 1) + number) .. result
-end
-
 local keywords = modlib.table.set{"and", "break", "do", "else", "elseif", "end", "false", "for", "function", "if", "in", "local", "nil", "not", "or", "repeat", "return", "then", "true", "until", "while"}
 keywords["goto"] = true -- Lua 5.2 (LuaJIT) support
 
diff --git a/utf8.lua b/utf8.lua
new file mode 100644
index 0000000..061298f
--- /dev/null
+++ b/utf8.lua
@@ -0,0 +1,33 @@
+local string_char = string.char
+
+local utf8 = {}
+
+--- Encode a Unicode codepoint as a UTF-8 byte string.
+-- @tparam number codepoint integer codepoint to encode
+-- @treturn[1] string the UTF-8 encoding (1 to 4 bytes)
+-- @treturn[2] nil if the codepoint is in 0x80..0x9F or above 0x10FFFF
+function utf8.char(codepoint)
+	if codepoint <= 0x007F then
+		-- Single byte
+		return string_char(codepoint)
+	end
+	if codepoint < 0x00A0 or codepoint > 0x10FFFF then
+		-- Out of range
+		return -- TODO (?) error instead
+	end
+	local result = ""
+	local i = 0 -- number of continuation bytes emitted so far
+	repeat
+		-- Peel off the low 6 bits as a continuation byte (10xxxxxx)
+		local remainder = codepoint % 64
+		result = string_char(128 + remainder) .. result
+		codepoint = (codepoint - remainder) / 64
+		i = i + 1
+	-- Stop once the remaining bits fit the lead byte's payload of (8 - i - 2) bits;
+	-- this must be a strict comparison, else 0x800 and 0x10000 get a truncated encoding
+	until codepoint < 2 ^ (8 - i - 2)
+	-- Lead byte: (i + 1) high one-bits, a zero bit, then the remaining payload
+	return string_char(0x100 - 2 ^ (8 - i - 1) + codepoint) .. result
+end
+
+return utf8
\ No newline at end of file
diff --git a/web/html.lua b/web/html.lua
index f1b9aad..9b141d4 100644
--- a/web/html.lua
+++ b/web/html.lua
@@ -6,8 +6,8 @@ local html = setmetatable({}, {__index = function(self, key)
 		local function unescape(text)
 			return text
 				:gsub("&([A-Za-z]+);", named_entities) -- named
-				:gsub("&#(%d+);", function(digits) return modlib.text.utf8(tonumber(digits)) end) -- decimal
-				:gsub("&#x(%x+);", function(digits) return modlib.text.utf8(tonumber(digits, 16)) end) -- hex
+				:gsub("&#(%d+);", function(digits) return modlib.utf8.char(tonumber(digits)) end) -- decimal
+				:gsub("&#x(%x+);", function(digits) return modlib.utf8.char(tonumber(digits, 16)) end) -- hex
 		end
 		self.unescape = unescape
 		return unescape