hypertext: Add support for nested lists

This commit is contained in:
rubenwardy 2023-04-19 19:47:11 +01:00
parent 0a06e41497
commit 2a0545210b
2 changed files with 66 additions and 52 deletions

@ -75,12 +75,17 @@ def test_bullets():
html = """ html = """
<ul> <ul>
<li>One</li> <li>One</li>
<li>two three</li> <li>two three<ul><li>sub one</li><li>sub two</li></ul></li>
<li>four</li> <li>four</li>
</ul> </ul>
""" """
expected = "• One\n• two three\n• four\n" expected = "<img name=blank.png width=16 height=1>• One\n" \
"<img name=blank.png width=16 height=1>• two three\n" \
"<img name=blank.png width=32 height=1>• sub one\n" \
"<img name=blank.png width=32 height=1>• sub two\n\n" \
"<img name=blank.png width=16 height=1>• four\n"
result = html_to_minetest(html) result = html_to_minetest(html)
assert result["body"].strip() == expected.strip() assert result["body"].strip() == expected.strip()

@ -24,52 +24,59 @@ def get_attributes(attrs):
return retval return retval
def make_indent(w, unit=16):
    """Return a spacer image tag used to indent a line by ``w`` levels.

    The tag references ``blank.png`` with a height of 1 and a width of
    ``unit * w``, so each extra level widens the spacer by ``unit``.

    Args:
        w: Indentation level (number of steps).
        unit: Width of one indentation step. Defaults to 16, the value that
            was previously hard-coded, so existing one-argument calls are
            unaffected.

    Returns:
        The ``<img ...>`` markup string for the spacer.
    """
    # NOTE(review): blank.png is presumably a transparent texture shipped
    # with the client-side formspec -- confirm against the caller.
    return f"<img name=blank.png width={unit * w} height=1>"
class MinetestHTMLParser(HTMLParser): class MinetestHTMLParser(HTMLParser):
def __init__(self, include_images): def __init__(self, include_images):
super().__init__() super().__init__()
self.include_images = include_images self.include_images = include_images
self.text_buffer = "" self.completed_text = ""
self.has_line_started = False self.current_line = ""
self.links = {} self.links = {}
self.images = {} self.images = {}
self.image_tooltips = {} self.image_tooltips = {}
self.is_preserving = False self.is_preserving = False
self.remove_until = None self.remove_until = None
self.indent_level = 0
def finish_line(self):
    """Flush the line being built into the completed output.

    Trailing whitespace is stripped from ``self.current_line``, the result
    is appended to ``self.completed_text`` followed by a newline, and
    ``self.current_line`` is reset to an empty string. Calling this while
    the current line is empty therefore appends a bare newline.
    """
    # Only trailing whitespace is removed; anything at the start of the
    # line (such as an indent spacer tag) is kept as-is.
    self.completed_text += self.current_line.rstrip() + "\n"
    self.current_line = ""
def handle_starttag(self, tag, attrs): def handle_starttag(self, tag, attrs):
if self.is_preserving or self.remove_until: if self.is_preserving or self.remove_until:
return return
print("OPEN", tag, file=sys.stderr)
self.has_line_started = True
if tag == "p": if tag == "p":
self.has_line_started = False pass
elif tag == "pre": elif tag == "pre":
self.text_buffer += "<code>" self.current_line += "<code>"
self.is_preserving = True self.is_preserving = True
self.has_line_started = False
elif tag == "table": elif tag == "table":
# Tables are currently unsupported and removed # Tables are currently unsupported and removed
self.remove_until = "table" self.remove_until = "table"
self.text_buffer += "<i>(table removed)</i>\n" self.current_line += "<i>(table removed)</i>"
self.finish_line()
elif tag == "br": elif tag == "br":
self.text_buffer += "\n" self.finish_line()
self.has_line_started = False
elif tag == "h1" or tag == "h2": elif tag == "h1" or tag == "h2":
self.text_buffer += "\n<big>" self.finish_line()
self.current_line += "<big>"
elif tag == "h3" or tag == "h4" or tag == "h5": elif tag == "h3" or tag == "h4" or tag == "h5":
self.text_buffer += "\n<b>" self.finish_line()
self.current_line += "<b>"
elif tag == "a": elif tag == "a":
for attr in attrs: for attr in attrs:
if attr[0] == "href": if attr[0] == "href":
name = f"link_{len(self.links)}" name = f"link_{len(self.links)}"
self.links[name] = attr[1] self.links[name] = attr[1]
self.text_buffer += f"<action name={name}><u>" self.current_line += f"<action name={name}><u>"
break break
else: else:
self.text_buffer += "<action><u>" self.current_line += "<action><u>"
elif tag == "img": elif tag == "img":
attr_by_value = get_attributes(attrs) attr_by_value = get_attributes(attrs)
if "src" in attr_by_value and self.include_images: if "src" in attr_by_value and self.include_images:
@ -77,23 +84,29 @@ class MinetestHTMLParser(HTMLParser):
self.images[name] = attr_by_value["src"] self.images[name] = attr_by_value["src"]
width = attr_by_value.get("width", 128) width = attr_by_value.get("width", 128)
height = attr_by_value.get("height", 128) height = attr_by_value.get("height", 128)
self.text_buffer += f"<img name={name} width={width} height={height}>" self.current_line += f"<img name={name} width={width} height={height}>"
if "alt" in attr_by_value: if "alt" in attr_by_value:
self.image_tooltips[name] = attr_by_value["alt"] self.image_tooltips[name] = attr_by_value["alt"]
elif tag == "b" or tag == "strong": elif tag == "b" or tag == "strong":
self.text_buffer += "<b>" self.current_line += "<b>"
elif tag == "i" or tag == "em": elif tag == "i" or tag == "em":
self.text_buffer += "<i>" self.current_line += "<i>"
elif tag == "u": elif tag == "u":
self.text_buffer += "<u>" self.current_line += "<u>"
elif tag == "li": elif tag == "li":
self.has_line_started = False if self.current_line.strip() != "":
self.text_buffer += "" self.finish_line()
else:
self.current_line = ""
self.current_line += make_indent(self.indent_level) + ""
elif tag == "code": elif tag == "code":
self.text_buffer += "<code>" self.current_line += "<code>"
elif tag == "span" or tag == "ul": elif tag == "span":
pass pass
elif tag == "ul":
self.indent_level += 1
else: else:
print("UNKNOWN TAG ", tag, attrs, file=sys.stderr) print("UNKNOWN TAG ", tag, attrs, file=sys.stderr)
@ -103,52 +116,46 @@ class MinetestHTMLParser(HTMLParser):
self.remove_until = None self.remove_until = None
return return
print("CLOSE", tag, file=sys.stderr)
if tag == "pre": if tag == "pre":
self.text_buffer = self.text_buffer.rstrip() self.current_line = self.current_line.rstrip() + "</code>"
self.text_buffer += "</code>\n" self.finish_line()
self.is_preserving = False self.is_preserving = False
self.has_line_started = False
elif self.is_preserving: elif self.is_preserving:
return return
elif tag == "p": elif tag == "p":
self.text_buffer = self.text_buffer.rstrip() self.current_line = self.current_line.rstrip()
self.text_buffer += "\n" self.finish_line()
self.has_line_started = False
elif tag == "h1" or tag == "h2": elif tag == "h1" or tag == "h2":
self.text_buffer += "</big>\n" self.current_line += "</big>"
self.has_line_started = False self.finish_line()
elif tag == "h3" or tag == "h4" or tag == "h5": elif tag == "h3" or tag == "h4" or tag == "h5":
self.text_buffer += "</b>\n" self.current_line += "</b>"
self.has_line_started = False self.finish_line()
elif tag == "a": elif tag == "a":
self.text_buffer += "</u></action>" self.current_line += "</u></action>"
elif tag == "code": elif tag == "code":
self.text_buffer += "</code>" self.current_line += "</code>"
elif tag == "b" or tag == "strong": elif tag == "b" or tag == "strong":
self.text_buffer += "</b>" self.current_line += "</b>"
elif tag == "i" or tag == "em": elif tag == "i" or tag == "em":
self.text_buffer += "</i>" self.current_line += "</i>"
elif tag == "u": elif tag == "u":
self.text_buffer += "</u>" self.current_line += "</u>"
elif tag == "li": elif tag == "li":
self.text_buffer += "\n" self.finish_line()
# else: elif tag == "ul":
# print("END", tag, file=sys.stderr) self.indent_level = max(self.indent_level - 1, 0)
def handle_data(self, data): def handle_data(self, data):
print(f"DATA \"{data}\"", file=sys.stderr)
if self.remove_until: if self.remove_until:
return return
if not self.is_preserving: if not self.is_preserving:
data = normalize_whitespace(data) data = normalize_whitespace(data)
if not self.has_line_started: if self.current_line.strip() == "":
data = data.lstrip() data = data.lstrip()
self.text_buffer += data self.current_line += data
self.has_line_started = True
def handle_entityref(self, name): def handle_entityref(self, name):
to_value = { to_value = {
@ -160,17 +167,19 @@ class MinetestHTMLParser(HTMLParser):
} }
if name in to_value: if name in to_value:
self.text_buffer += to_value[name] self.current_line += to_value[name]
else: else:
self.text_buffer += f"&{name};" self.current_line += f"&{name};"
def html_to_minetest(html, formspec_version=6, include_images=True): def html_to_minetest(html, formspec_version=6, include_images=True):
parser = MinetestHTMLParser(include_images) parser = MinetestHTMLParser(include_images)
parser.feed(html) parser.feed(html)
parser.finish_line()
return { return {
"head": HEAD, "head": HEAD,
"body": parser.text_buffer.strip() + "\n\n", "body": parser.completed_text.strip() + "\n",
"links": parser.links, "links": parser.links,
"images": parser.images, "images": parser.images,
"image_tooltips": parser.image_tooltips, "image_tooltips": parser.image_tooltips,