Fix various issues with forum topic importing

Fixes #201
This commit is contained in:
rubenwardy 2024-06-22 11:11:57 +01:00
parent 12545c69ac
commit ca961cb35f
5 changed files with 68 additions and 42 deletions

@ -111,9 +111,9 @@ def recalc_scores():
@action("Import forum topic list")
def do_import_topic_list():
    # Queue the background import task, then send the admin to the task
    # status view, passing the admin page URL as the `r` parameter.
    task = import_topic_list.delay()
    return_url = url_for("admin.admin_page")
    status_url = url_for("tasks.check", id=task.id, r=return_url)
    return redirect(status_url)
@action("Check all forum accounts") @action("Check all forum accounts")

@ -18,15 +18,41 @@ import json
import re import re
import sys import sys
import urllib.request import urllib.request
from typing import Optional
from urllib.parse import urljoin from urllib.parse import urljoin
from sqlalchemy import or_
from app.models import User, db, PackageType, ForumTopic from app.models import User, db, PackageType, ForumTopic
from app.tasks import celery from app.tasks import celery
from app.utils import is_username_valid from app.utils import make_valid_username
from app.utils.phpbbparser import get_profile, get_topics_from_forum from app.utils.phpbbparser import get_profile, get_topics_from_forum
from .usertasks import set_profile_picture_from_url, update_github_user_id_raw from .usertasks import set_profile_picture_from_url, update_github_user_id_raw
def _get_or_create_user(forums_username: str, cache: Optional[dict] = None) -> Optional[User]:
    """Return the user linked to a forum account, creating one if needed.

    Lookup order: the optional ``cache``, then an existing ``User`` whose
    ``forums_username`` matches. If neither exists, a new ``User`` is created
    under a sanitised version of the forum name and added to the session
    (it is not committed here).

    :param forums_username: Username as it appears on the forums.
    :param cache: Optional dict mapping forum usernames to users. It is read
        before hitting the database and updated with the result. Callers
        typically pass an initially empty dict.
    :return: The existing or newly created user, or ``None`` when the
        sanitised username is already taken by another account (either as
        its ``username`` or its ``forums_username``).
    """
    # Must be an identity check: an empty dict is falsy, so `if cache:` would
    # skip the cache forever when the caller starts with `{}`.
    if cache is not None:
        user = cache.get(forums_username)
        if user:
            return user

    user = User.query.filter_by(forums_username=forums_username).first()
    if user is None:
        cdb_username = make_valid_username(forums_username)

        # Refuse to create an account whose name collides with another user
        conflict = User.query.filter(
            or_(User.username == cdb_username, User.forums_username == cdb_username)
        ).first()
        if conflict:
            return None

        user = User(cdb_username)
        user.forums_username = forums_username
        user.display_name = forums_username
        db.session.add(user)

    if cache is not None:
        cache[forums_username] = user
    return user
@celery.task() @celery.task()
def check_forum_account(forums_username, force_replace_pic=False): def check_forum_account(forums_username, force_replace_pic=False):
print("### Checking " + forums_username, file=sys.stderr) print("### Checking " + forums_username, file=sys.stderr)
@ -39,19 +65,16 @@ def check_forum_account(forums_username, force_replace_pic=False):
if profile is None: if profile is None:
return return
user = User.query.filter_by(forums_username=forums_username).first() user = _get_or_create_user(forums_username)
# Create user
needs_saving = False
if user is None: if user is None:
user = User(forums_username) return
user.forums_username = forums_username
db.session.add(user) needs_saving = False
# Get GitHub username # Get GitHub username
github_username = profile.get("github") github_username = profile.get("github")
if github_username is not None and github_username.strip() != "": if github_username is not None and github_username.strip() != "":
print("Updated GitHub username for " + user.display_name + " to " + github_username) print("Updated GitHub username for " + user.display_name + " to " + github_username, file=sys.stderr)
user.github_username = github_username user.github_username = github_username
update_github_user_id_raw(user) update_github_user_id_raw(user)
needs_saving = True needs_saving = True
@ -104,7 +127,7 @@ regex_title = re.compile(r"^((?:\[[^\]]+\] *)*)([^\[]+) *((?:\[[^\]]+\] *)*)[^\[
def parse_title(title): def parse_title(title):
m = regex_title.match(title) m = regex_title.match(title)
if m is None: if m is None:
print("Invalid title format: " + title) print("Invalid title format: " + title, file=sys.stderr)
return title, get_name_from_taglist(title) return title, get_name_from_taglist(title)
else: else:
return m.group(2).strip(), get_name_from_taglist(m.group(3)) return m.group(2).strip(), get_name_from_taglist(m.group(3))
@ -124,7 +147,7 @@ def get_links_from_mod_search():
pass pass
except urllib.error.URLError: except urllib.error.URLError:
print("Unable to open krocks mod search!") print("Unable to open krocks mod search!", file=sys.stderr)
return links return links
return links return links
@ -135,37 +158,23 @@ def import_topic_list():
links_by_id = get_links_from_mod_search() links_by_id = get_links_from_mod_search()
info_by_id = {} info_by_id = {}
get_topics_from_forum(11, out=info_by_id, extra={'type': PackageType.MOD, 'wip': False})
get_topics_from_forum(9, out=info_by_id, extra={'type': PackageType.MOD, 'wip': True})
get_topics_from_forum(15, out=info_by_id, extra={'type': PackageType.GAME, 'wip': False}) get_topics_from_forum(15, out=info_by_id, extra={'type': PackageType.GAME, 'wip': False})
get_topics_from_forum(50, out=info_by_id, extra={'type': PackageType.GAME, 'wip': True}) get_topics_from_forum(50, out=info_by_id, extra={'type': PackageType.GAME, 'wip': True})
get_topics_from_forum(11, out=info_by_id, extra={'type': PackageType.MOD, 'wip': False})
get_topics_from_forum(9, out=info_by_id, extra={'type': PackageType.MOD, 'wip': True})
get_topics_from_forum(4, out=info_by_id, extra={'type': PackageType.TXP, 'wip': False})
# Caches # Caches
username_to_user = {} username_to_user = {}
topics_by_id = {} topics_by_id = {}
for topic in ForumTopic.query.all(): for topic in ForumTopic.query.all():
if topic.topic_id in info_by_id:
topics_by_id[topic.topic_id] = topic topics_by_id[topic.topic_id] = topic
else:
db.session.delete(topic)
print(f"Deleting topic {topic.topic_id} title {topic.title}", file=sys.stderr)
def get_or_create_user(username): username_conflicts = set()
user = username_to_user.get(username)
if user:
return user
if not is_username_valid(username):
return None
user = User.query.filter_by(forums_username=username).first()
if user is None:
user = User.query.filter_by(username=username).first()
if user:
return None
user = User(username)
user.forums_username = username
db.session.add(user)
username_to_user[username] = user
return user
# Create or update # Create or update
for info in info_by_id.values(): for info in info_by_id.values():
@ -173,9 +182,9 @@ def import_topic_list():
# Get author # Get author
username = info["author"] username = info["author"]
user = get_or_create_user(username) user = _get_or_create_user(username, username_to_user)
if user is None: if user is None:
print("Error! Unable to create user {}".format(username), file=sys.stderr) username_conflicts.add(username)
continue continue
# Get / add row # Get / add row
@ -203,3 +212,6 @@ def import_topic_list():
topic.created_at = info["date"] topic.created_at = info["date"]
db.session.commit() db.session.commit()
if len(username_conflicts) > 0:
print("The following forum usernames could not be created: " + (", ".join(username_conflicts)))

@ -16,6 +16,15 @@
import user_agents import user_agents
from app.utils import make_valid_username
def test_make_valid_username():
    """Check make_valid_username against known input/output pairs."""
    expected_by_input = {
        "rubenwardy": "rubenwardy",
        "Test123._-": "Test123._-",
        "Foo Bar": "Foo_Bar",
        "François": "Fran_ois",
    }
    for raw, expected in expected_by_input.items():
        assert make_valid_username(raw) == expected
def test_web_is_not_bot(): def test_web_is_not_bot():
assert not user_agents.parse("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:126.0) Gecko/20100101 Firefox/126.0").is_bot assert not user_agents.parse("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:126.0) Gecko/20100101 Firefox/126.0").is_bot

@ -28,11 +28,15 @@ from .user import *
YESES = ["yes", "true", "1", "on"] YESES = ["yes", "true", "1", "on"]
def is_username_valid(username: str) -> bool:
    """Return whether ``username`` is acceptable as an account name.

    A valid username is at least two characters long, consists only of
    ``A-Z``, ``a-z``, ``0-9``, ``.``, ``_`` and ``-``, and is not made up
    solely of dots. ``None`` is treated as invalid.

    :param username: Candidate username (may be ``None``).
    :return: ``True`` if valid, ``False`` otherwise (always a real bool).
    """
    if username is None or len(username) < 2:
        return False

    # fullmatch rather than match with `$`: `$` also matches just before a
    # trailing newline, which would let "name\n" through.
    if re.fullmatch(r"[A-Za-z0-9._-]*", username) is None:
        return False

    # Names consisting only of dots are rejected
    return re.fullmatch(r"\.*", username) is None
def make_valid_username(username: str) -> str:
    """Replace every run of disallowed characters with a single underscore.

    Allowed characters are ``A-Z``, ``a-z``, ``0-9``, ``.``, ``_`` and ``-``.
    Only substitution is performed; the length of the result is not checked.

    :param username: Arbitrary name, e.g. a forum username.
    :return: The name with each disallowed run collapsed to ``_``.
    """
    disallowed_run = re.compile(r"[^A-Za-z0-9._-]+")
    return disallowed_run.sub("_", username)
def is_yes(val):
    # A falsy value (None, "") is handed back unchanged, exactly as the
    # short-circuiting `val and ...` expression would do.
    if not val:
        return val

    # Case-insensitive membership test against the accepted "yes" spellings
    return val.lower() in YESES

@ -3,6 +3,7 @@
# Source: https://github.com/rubenwardy/python_phpbb_parser # Source: https://github.com/rubenwardy/python_phpbb_parser
import re import re
import sys
import urllib import urllib
import urllib.parse as urlparse import urllib.parse as urlparse
import urllib.request import urllib.request
@ -121,7 +122,7 @@ regex_id = re.compile(r"^.*t=([0-9]+).*$")
def parse_forum_list_page(id, page, out, extra=None): def parse_forum_list_page(id, page, out, extra=None):
num_per_page = 30 num_per_page = 30
start = page*num_per_page+1 start = page*num_per_page+1
print(" - Fetching page {} (topics {}-{})".format(page, start, start+num_per_page)) print(" - Fetching page {} (topics {}-{})".format(page, start, start+num_per_page), file=sys.stderr)
url = "https://forum.minetest.net/viewforum.php?f=" + str(id) + "&start=" + str(start) url = "https://forum.minetest.net/viewforum.php?f=" + str(id) + "&start=" + str(start)
r = urllib.request.urlopen(url).read().decode("utf-8") r = urllib.request.urlopen(url).read().decode("utf-8")
@ -154,7 +155,7 @@ def parse_forum_list_page(id, page, out, extra=None):
views = topic.find(class_="views").find(text=True) views = topic.find(class_="views").find(text=True)
if id in out: if id in out:
print(" - got {} again, title: {}".format(id, title)) print(" - got {} again, title: {}".format(id, title), file=sys.stderr)
assert title == out[id]['title'] assert title == out[id]['title']
return False return False
@ -177,7 +178,7 @@ def parse_forum_list_page(id, page, out, extra=None):
def get_topics_from_forum(id, out, extra=None): def get_topics_from_forum(id, out, extra=None):
print("Fetching all topics from forum {}".format(id)) print("Fetching all topics from forum {}".format(id), file=sys.stderr)
page = 0 page = 0
while parse_forum_list_page(id, page, out, extra): while parse_forum_list_page(id, page, out, extra):
page = page + 1 page = page + 1