Move text.utf8 to utf8.char

This commit is contained in:
Lars Mueller 2022-09-09 14:17:18 +02:00
parent 341dd5851b
commit 825599f2d0
6 changed files with 31 additions and 26 deletions

@ -5,7 +5,7 @@ local res, code = https.request"https://html.spec.whatwg.org/entities.json"
assert(code == 200) assert(code == 200)
local entity_map = {} local entity_map = {}
for entity, chars in pairs(assert(modlib.json:read_string(res))) do for entity, chars in pairs(assert(modlib.json:read_string(res))) do
entity_map[entity:sub(2, #entity - 1)] = table.concat(modlib.table.map(chars.codepoints, modlib.text.utf8)) entity_map[entity:sub(2, #entity - 1)] = table.concat(modlib.table.map(chars.codepoints, modlib.utf8.char))
end end
local entries = {} local entries = {}
for entity, chars in pairs(entity_map) do for entity, chars in pairs(entity_map) do

@ -45,6 +45,7 @@ for _, file in pairs{
"table", "table",
"vararg", "vararg",
"text", "text",
"utf8",
"vector", "vector",
"quaternion", "quaternion",
"trie", "trie",

@ -101,7 +101,7 @@ for i = 0, 5 do
end end
-- TODO SAX vs DOM -- TODO SAX vs DOM
local utf8 = modlib.text.utf8 local utf8_char = modlib.utf8.char
function read(self, read_) function read(self, read_)
local index = 0 local index = 0
local char local char
@ -149,7 +149,7 @@ function read(self, read_)
end end
end end
local function utf8_codepoint(codepoint) local function utf8_codepoint(codepoint)
return syntax_assert(utf8(codepoint), "invalid codepoint") return syntax_assert(utf8_char(codepoint), "invalid codepoint")
end end
local function string() local function string()
local chars = {} local chars = {}

@ -138,27 +138,6 @@ magic_charset = "[" .. table.concat(magic_charset) .. "]"
function escape_magic_chars(text) return text:gsub("(" .. magic_charset .. ")", "%%%1") end function escape_magic_chars(text) return text:gsub("(" .. magic_charset .. ")", "%%%1") end
--- Encodes a Unicode codepoint as a UTF-8 byte string.
-- @param number codepoint to encode; expected to be a non-negative integer
-- @return the UTF-8 encoding as a string of 1 to 4 bytes,
--         or nothing if the codepoint is larger than 0x10FFFF
function utf8(number)
	if number <= 0x007F then
		-- Single byte (ASCII)
		return string.char(number)
	end
	if number > 0x10FFFF then
		-- Out of range
		return
	end
	local result = ""
	local i = 0
	repeat
		-- Peel off the lowest 6 bits into a continuation byte (10xxxxxx)
		local remainder = number % 64
		result = string.char(128 + remainder) .. result
		number = (number - remainder) / 64
		i = i + 1
	-- A lead byte followed by i continuation bytes has 8 - i - 2 payload bits,
	-- so the remaining value must be strictly smaller than 2 ^ (8 - i - 2);
	-- using <= here would emit truncated sequences for e.g. U+0800 and U+10000
	until number < 2 ^ (8 - i - 2)
	-- Lead byte: i + 1 leading one bits, a zero bit, then the remaining payload
	return string.char(256 - 2 ^ (8 - i - 1) + number) .. result
end
local keywords = modlib.table.set{"and", "break", "do", "else", "elseif", "end", "false", "for", "function", "if", "in", "local", "nil", "not", "or", "repeat", "return", "then", "true", "until", "while"} local keywords = modlib.table.set{"and", "break", "do", "else", "elseif", "end", "false", "for", "function", "if", "in", "local", "nil", "not", "or", "repeat", "return", "then", "true", "until", "while"}
keywords["goto"] = true -- Lua 5.2 (LuaJIT) support keywords["goto"] = true -- Lua 5.2 (LuaJIT) support

25
utf8.lua Normal file

@ -0,0 +1,25 @@
local string_char = string.char

local utf8 = {}

--- Encodes a Unicode codepoint as a UTF-8 byte string.
-- @param codepoint codepoint to encode; expected to be a non-negative integer
-- @return the UTF-8 encoding as a string of 1 to 4 bytes,
--         or nothing if the codepoint is larger than 0x10FFFF
function utf8.char(codepoint)
	if codepoint <= 0x007F then
		-- Single byte (ASCII)
		return string_char(codepoint)
	end
	if codepoint > 0x10FFFF then
		-- Out of range
		return -- TODO (?) error instead
	end
	local result = ""
	local i = 0
	repeat
		-- Peel off the lowest 6 bits into a continuation byte (10xxxxxx)
		local remainder = codepoint % 64
		result = string_char(128 + remainder) .. result
		codepoint = (codepoint - remainder) / 64
		i = i + 1
	-- A lead byte followed by i continuation bytes has 8 - i - 2 payload bits,
	-- so the remaining value must be strictly smaller than 2 ^ (8 - i - 2);
	-- using <= here would emit truncated sequences for e.g. U+0800 and U+10000
	until codepoint < 2 ^ (8 - i - 2)
	-- Lead byte: i + 1 leading one bits, a zero bit, then the remaining payload
	return string_char(0x100 - 2 ^ (8 - i - 1) + codepoint) .. result
end
return utf8

@ -6,8 +6,8 @@ local html = setmetatable({}, {__index = function(self, key)
local function unescape(text) local function unescape(text)
return text return text
:gsub("&([A-Za-z]+);", named_entities) -- named :gsub("&([A-Za-z]+);", named_entities) -- named
:gsub("&#(%d+);", function(digits) return modlib.text.utf8(tonumber(digits)) end) -- decimal :gsub("&#(%d+);", function(digits) return modlib.utf8.char(tonumber(digits)) end) -- decimal
:gsub("&#x(%x+);", function(digits) return modlib.text.utf8(tonumber(digits, 16)) end) -- hex :gsub("&#x(%x+);", function(digits) return modlib.utf8.char(tonumber(digits, 16)) end) -- hex
end end
self.unescape = unescape self.unescape = unescape
return unescape return unescape