contentdb/app/utils/phpbbparser.py

187 lines
4.2 KiB
Python
Raw Normal View History

2018-05-17 16:18:20 +02:00
# Copyright (c) 2016 Andrew "rubenwardy" Ward
# License: MIT
# Source: https://github.com/rubenwardy/python_phpbb_parser
2020-12-04 03:23:04 +01:00
import re
import sys
2020-12-04 03:23:04 +01:00
import urllib
import urllib.parse as urlparse
2020-12-04 03:23:04 +01:00
import urllib.request
from datetime import datetime
from urllib.parse import urlencode
2023-06-19 20:32:36 +02:00
from bs4 import BeautifulSoup
2020-12-04 03:23:04 +01:00
2023-06-19 22:27:49 +02:00
def url_encode_non_ascii(b):
    """Percent-escape every character of ``b`` in the range U+0080..U+00FF.

    Each matching character is replaced by ``%`` followed by its code point
    as two lowercase hex digits. All other characters (plain ASCII and
    anything above U+00FF) are returned unchanged.
    """
    def _escape(match):
        # ord() of the single matched character, rendered as %xx.
        return '%%%02x' % ord(match.group(0))

    return re.sub('[\x80-\xFF]', _escape, b)
2018-05-14 00:31:42 +02:00
2023-06-19 22:27:49 +02:00
2018-05-14 00:31:42 +02:00
class Profile:
    """Container for the data scraped from a forum user's profile page.

    Attributes:
        username: the name the profile was created with.
        signature: signature markup; starts as an empty string.
        avatar: avatar image source; starts as ``None``.
        properties: free-form profile fields, keyed by lower-cased name.
    """

    def __init__(self, username):
        self.username = username
        self.signature = ""
        self.avatar = None
        self.properties = {}

    def set(self, key, value):
        """Store ``value`` under ``key``; keys are case-insensitive."""
        normalised = key.lower()
        self.properties[normalised] = value

    def get(self, key):
        """Return the value stored for ``key`` (case-insensitive), or ``None``."""
        normalised = key.lower()
        return self.properties.get(normalised)

    def __str__(self):
        # Three lines: username, signature, then the properties dict.
        lines = [self.username, str(self.signature), str(self.properties)]
        return "\n".join(lines)
2023-06-19 22:27:49 +02:00
2018-05-14 00:31:42 +02:00
def __extract_properties(profile, soup):
    """Fill ``profile`` with the avatar and detail fields found in ``soup``.

    Looks inside the element with id ``viewprofile``:

    * If the first ``<dl>`` in it contains exactly one ``<img>`` that has a
      ``src`` attribute, that value is stored in ``profile.avatar``.
    * If exactly one ``dl.left-box.details`` element exists, its ``<dt>`` /
      ``<dd>`` children are read as key/value pairs and stored through
      ``profile.set`` (the "groups" entry is skipped).

    Args:
        profile: object exposing ``avatar`` and ``set(key, value)``.
        soup: parsed profile page (BeautifulSoup-style ``find``/``select`` API).

    Returns:
        Always ``None``; results are written onto ``profile``.
    """
    el = soup.find(id="viewprofile")
    if el is None:
        return None

    # Avatar: guard against a profile container without any <dl>, and an
    # <img> without a src, instead of raising IndexError / KeyError.
    res1 = el.find_all("dl")
    if res1:
        imgs = res1[0].find_all("img")
        if len(imgs) == 1:
            src = imgs[0].get("src")
            if src is not None:
                profile.avatar = src

    res = el.select("dl.left-box.details")
    if len(res) != 1:
        return None

    catch_next_key = None

    # Walk the direct children: each <dt> names a field, the following <dd>
    # carries its value.
    for element in res[0].children:
        if element.name == "dt":
            if catch_next_key is None:
                # Drops the last character of the label before trimming
                # (presumably a trailing ":" -- confirm against the markup).
                catch_next_key = element.text.lower()[:-1].strip()
            else:
                print("Unexpected dt!")

        elif element.name == "dd":
            if catch_next_key is None:
                print("Unexpected dd!")
            else:
                if catch_next_key != "groups":
                    profile.set(catch_next_key, element.text)
                catch_next_key = None

        elif element and element.name is not None:
            # Any other named tag; nodes whose name is None are ignored.
            print("Unexpected other")
2023-06-19 22:27:49 +02:00
2018-05-14 00:31:42 +02:00
def __extract_signature(soup):
    """Return the markup of the page's signature block, if unambiguous.

    Searches ``soup`` for ``<div class="signature">`` elements. When exactly
    one is found its string form is returned; with zero or several matches
    the result is ``None``.
    """
    matches = soup.find_all("div", class_="signature")
    return str(matches[0]) if len(matches) == 1 else None
2018-05-14 00:31:42 +02:00
2023-06-19 22:27:49 +02:00
def get_profile_url(url, username):
    """Build the ``memberlist.php`` profile URL for ``username``.

    The scheme, host, and any existing query arguments of ``url`` are kept.
    The path is replaced by ``/memberlist.php`` and the query gains (or
    overrides) ``un=<username>`` and ``mode=viewprofile``.
    """
    parts = urlparse.urlparse(url)

    # Start from the existing query arguments, then force ours on top.
    params = {
        **dict(urlparse.parse_qsl(parts.query)),
        "un": username,
        "mode": "viewprofile",
    }

    rebuilt = parts._replace(path="/memberlist.php", query=urlencode(params))
    return urlparse.urlunparse(rebuilt)
2023-06-19 22:27:49 +02:00
def get_profile(url, username):
    """Download and parse the forum profile page of ``username``.

    Args:
        url: any URL on the forum; only its scheme/host/query are reused
            (see ``get_profile_url``).
        username: forum user name to look up.

    Returns:
        A ``Profile`` with the signature, avatar, and properties that could
        be extracted, or ``None`` when the server answers with HTTP 404.

    Raises:
        IOError: for any HTTP error status other than 404 (the original
            ``HTTPError`` is attached as the cause).
    """
    url = get_profile_url(url, username)

    try:
        req = urllib.request.urlopen(url, timeout=15)
    except urllib.error.HTTPError as e:
        if e.code == 404:
            return None

        raise IOError(e) from e

    # Close the HTTP response once the body is read instead of leaking it.
    with req:
        contents = req.read().decode("utf-8")

    soup = BeautifulSoup(contents, "lxml")

    profile = Profile(username)
    profile.signature = __extract_signature(soup)
    __extract_properties(profile, soup)

    return profile
# Captures the numeric topic id from the ``t`` query argument of a topic link.
# ``t=`` must sit at the start of the string or directly after ``?``, ``&`` or
# ``;`` so that other arguments ending in "t" (e.g. ``start=30``) are not
# mistaken for the topic id.
regex_id = re.compile(r"^(?:.*[?&;])?t=([0-9]+).*$")
2023-06-19 22:27:49 +02:00
def parse_forum_list_page(id, page, out, extra=None):
    """Fetch one page of a forum's topic list and merge its topics into ``out``.

    Sticky, announcement, and global-announcement rows are skipped, as are
    rows whose poster cell contains no link.

    Args:
        id: forum id, used for the ``f`` query argument.
        page: zero-based page number; 30 topics are assumed per page.
        out: dict mapping topic id (string) to a topic dict; updated in place.
        extra: optional dict whose items are copied into every topic dict.

    Returns:
        ``True`` if at least one new topic was added and the caller should
        fetch the next page. ``False`` if a topic already present in ``out``
        was encountered, or if the page contributed no new topics at all
        (otherwise an empty or sticky-only forum would make the caller loop
        forever).
    """
    num_per_page = 30
    start = page*num_per_page+1
    print(" - Fetching page {} (topics {}-{})".format(page, start, start+num_per_page), file=sys.stderr)

    url = "https://forum.luanti.org/viewforum.php?f=" + str(id) + "&start=" + str(start)
    # Close the HTTP response once the body is read instead of leaking it.
    with urllib.request.urlopen(url) as response:
        r = response.read().decode("utf-8")

    soup = BeautifulSoup(r, "html.parser")

    added_any = False
    for row in soup.find_all("li", class_="row"):
        classes = row.get("class")
        if "sticky" in classes or "announce" in classes or "global-announce" in classes:
            continue

        topic = row.find("dl")

        # Link info. A separate name is used so the forum ``id`` argument is
        # not overwritten by the topic id.
        link = topic.find(class_="topictitle")
        topic_id = regex_id.match(link.get("href")).group(1)
        title = link.find(text=True)

        # Date
        left = topic.find(class_="topic-poster")
        date = left.find("time").get_text()
        date = datetime.strptime(date, "%a %b %d, %Y %H:%M")
        links = left.find_all("a")
        if len(links) == 0:
            continue
        # The last link in the poster cell is taken as the author.
        author = links[-1].get_text().strip()

        # Get counts
        posts = topic.find(class_="posts").find(text=True)
        views = topic.find(class_="views").find(text=True)

        if topic_id in out:
            # Seeing a known topic again ends the crawl.
            print(" - got {} again, title: {}".format(topic_id, title), file=sys.stderr)
            assert title == out[topic_id]['title']
            return False

        entry = {
            "id": topic_id,
            "title": title,
            "author": author,
            "posts": posts,
            "views": views,
            "date": date,
        }

        if extra is not None:
            for key, value in extra.items():
                entry[key] = value

        out[topic_id] = entry
        added_any = True

    return added_any
2023-06-19 22:27:49 +02:00
def get_topics_from_forum(id, out, extra=None):
    """Collect every topic of forum ``id`` into ``out`` and return ``out``.

    Pages are requested one after another, starting at page 0, until
    ``parse_forum_list_page`` reports that no further page should be fetched.
    ``extra`` is forwarded unchanged to each page parse.
    """
    print("Fetching all topics from forum {}".format(id), file=sys.stderr)

    page = 0
    while True:
        if not parse_forum_list_page(id, page, out, extra):
            break
        page += 1

    return out