# Copyright (c) 2016 Andrew "rubenwardy" Ward
# License: MIT
# Source: https://github.com/rubenwardy/python_phpbb_parser

import re
import urllib
import urllib.error
import urllib.parse as urlparse
import urllib.request
from datetime import datetime
from urllib.parse import urlencode

from bs4 import BeautifulSoup


def urlEncodeNonAscii(b):
    # Percent-encode characters in the \x80-\xFF range (e.g. "é" -> "%e9")
    return re.sub('[\x80-\xFF]', lambda c: '%%%02x' % ord(c.group(0)), b)


class Profile:
    def __init__(self, username):
        self.username = username
        self.signature = ""
        self.avatar = None
        self.properties = {}

    def set(self, key, value):
        self.properties[key.lower()] = value

    def get(self, key):
        return self.properties.get(key.lower())

    def __str__(self):
        return self.username + "\n" + str(self.signature) + "\n" + str(self.properties)
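
# Property keys are normalised to lower case, so lookups are case-insensitive.
# A minimal usage sketch (hypothetical values):
#
#     p = Profile("example_user")
#     p.set("Location", "UK")
#     p.get("location")  # -> "UK"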


def __extract_properties(profile, soup):
    el = soup.find(id="viewprofile")
    if el is None:
        return None

    res1 = el.find_all("dl")
    imgs = res1[0].find_all("img")
    if len(imgs) == 1:
        profile.avatar = imgs[0]["src"]

    res = el.select("dl.left-box.details")
    if len(res) != 1:
        return None

    catch_next_key = None

    # Walk the <dt>/<dd> pairs and record each one as a profile property
    for element in res[0].children:
        if element.name == "dt":
            if catch_next_key is None:
                catch_next_key = element.text.lower()[:-1].strip()
            else:
                print("Unexpected dt!")

        elif element.name == "dd":
            if catch_next_key is None:
                print("Unexpected dd!")
            else:
                if catch_next_key != "groups":
                    profile.set(catch_next_key, element.text)
                catch_next_key = None

        elif element and element.name is not None:
            print("Unexpected other")


def __extract_signature(soup):
    res = soup.find_all("div", class_="signature")
    if len(res) != 1:
        return None
    else:
        return str(res[0])


def getProfileURL(url, username):
    url = urlparse.urlparse(url)

    # Update path
    url = url._replace(path="/memberlist.php")

    # Set query args
    query = dict(urlparse.parse_qsl(url.query))
    query.update({"un": username, "mode": "viewprofile"})
    query_encoded = urlencode(query)
    url = url._replace(query=query_encoded)

    return urlparse.urlunparse(url)
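
# For example (illustrative only, assuming a phpBB forum at forum.minetest.net):
#
#     getProfileURL("https://forum.minetest.net/", "rubenwardy")
#     # -> "https://forum.minetest.net/memberlist.php?un=rubenwardy&mode=viewprofile"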


def getProfile(url, username):
    url = getProfileURL(url, username)

    try:
        req = urllib.request.urlopen(url, timeout=5)
    except urllib.error.HTTPError as e:
        if e.code == 404:
            return None

        raise IOError(e)

    contents = req.read().decode("utf-8")
    soup = BeautifulSoup(contents, "lxml")
    if soup is None:
        return None

    profile = Profile(username)
    profile.signature = __extract_signature(soup)
    __extract_properties(profile, soup)

    return profile


regex_id = re.compile(r"^.*t=([0-9]+).*$")


def parseForumListPage(id, page, out, extra=None):
    num_per_page = 30
    start = page * num_per_page + 1
    print(" - Fetching page {} (topics {}-{})".format(page, start, start + num_per_page - 1))

    url = "https://forum.minetest.net/viewforum.php?f=" + str(id) + "&start=" + str(start)
    r = urllib.request.urlopen(url).read().decode("utf-8")
    soup = BeautifulSoup(r, "html.parser")

    for row in soup.find_all("li", class_="row"):
        classes = row.get("class")
        if "sticky" in classes or "announce" in classes or "global-announce" in classes:
            continue

        topic = row.find("dl")

        # Link info
        link = topic.find(class_="topictitle")
        topic_id = regex_id.match(link.get("href")).group(1)
        title = link.find(text=True)

        # Date
        left = topic.find(class_="topic-poster")
        date = left.find("time").get_text()
        date = datetime.strptime(date, "%a %b %d, %Y %H:%M")

        links = left.find_all("a")
        if len(links) == 0:
            continue

        author = links[-1].get_text().strip()

        # Get counts
        posts = topic.find(class_="posts").find(text=True)
        views = topic.find(class_="views").find(text=True)

        if topic_id in out:
            print(" - got {} again, title: {}".format(topic_id, title))
            assert title == out[topic_id]["title"]
            return False

        entry = {
            "id": topic_id,
            "title": title,
            "author": author,
            "posts": posts,
            "views": views,
            "date": date
        }

        if extra is not None:
            for key, value in extra.items():
                entry[key] = value

        out[topic_id] = entry

    return True


def getTopicsFromForum(id, out, extra=None):
    print("Fetching all topics from forum {}".format(id))
    page = 0
    while parseForumListPage(id, page, out, extra):
        page = page + 1

    return out


def dumpTitlesToFile(topics, path):
    with open(path, "w") as out_file:
        for topic in topics.values():
            out_file.write(topic["title"] + "\n")
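

# A minimal usage sketch. The forum id (11), output filename ("titles.txt") and
# username are illustrative placeholders, and forum.minetest.net is assumed reachable.
if __name__ == "__main__":
    topics = {}
    getTopicsFromForum(11, topics)
    dumpTitlesToFile(topics, "titles.txt")

    profile = getProfile("https://forum.minetest.net/", "rubenwardy")
    if profile:
        print(profile)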