mirror of
https://github.com/minetest/contentdb.git
synced 2024-11-09 17:13:45 +01:00
Implement forum parser to increase accuracy
This commit is contained in:
parent
eb6b1d6375
commit
19e1ed8b32
@ -743,23 +743,25 @@ REPO_BLACKLIST = [".zip", "mediafire.com", "dropbox.com", "weebly.com", \
|
||||
"digitalaudioconcepts.com", "hg.intevation.org", "www.wtfpl.net", \
|
||||
"imageshack.com", "imgur.com"]
|
||||
|
||||
class KrockForumTopic(db.Model):
|
||||
class ForumTopic(db.Model):
|
||||
topic_id = db.Column(db.Integer, primary_key=True, autoincrement=False)
|
||||
author_id = db.Column(db.Integer, db.ForeignKey("user.id"), nullable=False)
|
||||
author = db.relationship("User")
|
||||
|
||||
ttype = db.Column(db.Integer, nullable=False)
|
||||
type = db.Column(db.Enum(PackageType), nullable=False)
|
||||
title = db.Column(db.String(200), nullable=False)
|
||||
name = db.Column(db.String(30), nullable=True)
|
||||
link = db.Column(db.String(200), nullable=True)
|
||||
|
||||
def getType(self):
|
||||
if self.ttype == 1 or self.ttype == 2:
|
||||
return PackageType.MOD
|
||||
elif self.ttype == 6:
|
||||
return PackageType.GAME
|
||||
posts = db.Column(db.Integer, nullable=False)
|
||||
views = db.Column(db.Integer, nullable=False)
|
||||
|
||||
created_at = db.Column(db.DateTime, nullable=False, default=datetime.utcnow)
|
||||
|
||||
def getRepoURL(self):
|
||||
if self.link is None:
|
||||
return None
|
||||
|
||||
for item in REPO_BLACKLIST:
|
||||
if item in self.link:
|
||||
return None
|
||||
|
@ -15,12 +15,12 @@
|
||||
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
|
||||
import flask, json
|
||||
import flask, json, re
|
||||
from flask.ext.sqlalchemy import SQLAlchemy
|
||||
from app import app
|
||||
from app.models import *
|
||||
from app.tasks import celery
|
||||
from .phpbbparser import getProfile
|
||||
from .phpbbparser import getProfile, getTopicsFromForum
|
||||
import urllib.request
|
||||
from urllib.parse import urlparse, quote_plus
|
||||
|
||||
@ -51,71 +51,88 @@ def checkForumAccount(username, token=None):
|
||||
if needsSaving:
|
||||
db.session.commit()
|
||||
|
||||
@celery.task()
|
||||
def importUsersFromModList():
|
||||
|
||||
# Matches a bracketed, lowercase tag such as "[my_mod]" and captures the text
# between the brackets. Tags with capitals, dots or dashes do not match.
regex_tag = re.compile(r"\[([a-z0-9_]+)\]")

# Generic status/category tags that are never a package's technical name.
BANNED_NAMES = ["mod", "game", "old", "outdated", "wip", "api", "beta", "alpha", "git"]


def getNameFromTaglist(taglist):
    """Pick the most likely technical name from a string of bracketed tags.

    Tags are examined from last to first and the first acceptable one wins.
    A tag is rejected when it is 30 characters or longer, is listed in
    BANNED_NAMES, or looks like a version number (only digits, optionally
    preceded by a single letter, e.g. "2" or "v3").

    Args:
        taglist: Text containing zero or more "[tag]" groups. None and the
            empty string are accepted and yield None.

    Returns:
        The chosen tag without its brackets, or None if no tag qualifies.
    """
    # Tolerate a missing tag list instead of raising TypeError in findall().
    if not taglist:
        return None

    for tag in reversed(regex_tag.findall(taglist)):
        if len(tag) >= 30 or tag in BANNED_NAMES:
            continue
        # Version-like tags ("0", "v2") describe a release, not a name.
        if re.match(r"^[a-z]?[0-9]+$", tag):
            continue
        return tag

    return None
|
||||
|
||||
# Splits a topic title into three captured parts: any leading "[tag]" groups,
# the bracket-free title text, and any "[tag]" groups that follow it.
# Bracket-free text after the trailing tags is matched but not captured.
regex_title = re.compile(r"^((?:\[[^\]]+\] *)*)([^\[]+) *((?:\[[^\]]+\] *)*)[^\[]*$")


def parseTitle(title):
    """Split a forum topic title into a display title and a technical name.

    Args:
        title: The raw topic title.

    Returns:
        A ``(title, name)`` tuple. When the title matches regex_title, the
        first item is the stripped title text and the name is looked up in
        the tags that follow it. Otherwise the raw title is returned
        unchanged and the name is looked up across the whole title. The name
        is None when no suitable tag is found.
    """
    parsed = regex_title.match(title)
    if parsed is not None:
        display_title = parsed.group(2).strip()
        trailing_tags = parsed.group(3)
        return display_title, getNameFromTaglist(trailing_tags)

    # Unrecognised layout: report it and fall back to scanning every tag.
    print("Invalid title format: " + title)
    return title, getNameFromTaglist(title)
|
||||
|
||||
def getLinksFromModSearch():
    """Download the external mod list and map forum topic ids to links.

    The list is fetched as JSON. Every entry that carries a non-None
    "link" value contributes one item, keyed by its "topicId" converted to
    int.

    Returns:
        dict: ``{topic_id: link}`` for the entries that have a link.
    """
    url = "http://krock-works.16mb.com/MTstuff/modList.php"

    # Use the response as a context manager so the connection is always
    # closed, even if reading or decoding fails.
    with urllib.request.urlopen(url) as response:
        contents = response.read().decode("utf-8")

    links = {}
    for entry in json.loads(contents):
        link = entry.get("link")
        if link is not None:
            links[int(entry["topicId"])] = link

    return links
|
||||
|
||||
@celery.task()
def importTopicList():
    """Synchronise the ForumTopic table with the topics scraped from the forum.

    Topics are collected from forum 11 (stored with PackageType.MOD) and
    forum 15 (stored with PackageType.GAME). For every scraped topic the
    matching ForumTopic row is updated, or created if there is none yet.
    Authors are looked up by forum username and created when unknown.
    Everything is committed in a single transaction at the end.
    """
    links_by_id = getLinksFromModSearch()

    info_by_id = {}
    getTopicsFromForum(11, out=info_by_id, extra={ 'type': PackageType.MOD })
    getTopicsFromForum(15, out=info_by_id, extra={ 'type': PackageType.GAME })

    # Caches, so each user and each existing topic row is loaded only once.
    username_to_user = {}
    topics_by_id = {topic.topic_id: topic for topic in ForumTopic.query.all()}

    # Create or update
    for info in info_by_id.values():
        # Scraped ids are strings; the table and the link map use ints.
        topic_id = int(info["id"])

        # Get author
        username = info["author"]
        user = username_to_user.get(username)
        if user is None:
            user = User.query.filter_by(forums_username=username).first()
            if user is None:
                print(username + " not found!")
                user = User(username)
                user.forums_username = username
                db.session.add(user)
            username_to_user[username] = user

        # Get / add row
        topic = topics_by_id.get(topic_id)
        if topic is None:
            topic = ForumTopic()
            db.session.add(topic)

        # Parse title
        title, name = parseTitle(info["title"])

        # Get link (None when the mod list has no entry for this topic)
        link = links_by_id.get(topic_id)

        # Fill row
        topic.topic_id = topic_id
        topic.author = user
        topic.type = info["type"]
        topic.title = title
        topic.name = name
        topic.link = link
        topic.posts = info["posts"]
        topic.views = info["views"]
        topic.created_at = info["date"]

    db.session.commit()
|
||||
|
@ -5,6 +5,7 @@
|
||||
import urllib, socket
|
||||
from bs4 import *
|
||||
from urllib.parse import urljoin
|
||||
from datetime import datetime
|
||||
import urllib.request
|
||||
import os.path
|
||||
import time, re
|
||||
@ -77,3 +78,72 @@ def getProfile(url, username):
|
||||
__extract_properties(profile, soup)
|
||||
|
||||
return profile
|
||||
|
||||
|
||||
# Extracts the numeric topic id from a topic link ("...t=1234...").
regex_id = re.compile(r"^.*t=([0-9]+).*$")


def parseForumListPage(id, page, out, extra=None):
    """Fetch one page of a forum's topic list and record its topics in ``out``.

    Sticky, announcement and global-announcement rows are skipped. Every
    other row becomes a dict with the keys "id", "title", "author", "posts",
    "views" and "date", stored in ``out`` under the topic id.

    Args:
        id: Forum id, sent as the ``f=`` query parameter.
        page: Zero-based page number; 30 topics per page are assumed.
        out: Dict updated in place. Keys are topic ids as strings, exactly
            as they appear in the topic link.
        extra: Optional dict whose items are copied into every row.

    Returns:
        True if new topics were recorded and the next page should be
        fetched. False if a topic already in ``out`` was met, or if the page
        contained no regular topic at all, so the caller's loop terminates.

    Raises:
        AssertionError: If a repeated topic id has a different title from
            the one recorded earlier.
    """
    num_per_page = 30
    # NOTE(review): the offset is page*30+1. If phpBB's "start" parameter is
    # a zero-based offset, the first topic of the forum is never fetched -
    # confirm against the forum before changing.
    start = page * num_per_page + 1
    print(" - Fetching page {} (topics {}-{})".format(page, start, start+num_per_page))

    url = "https://forum.minetest.net/viewforum.php?f=" + str(id) + "&start=" + str(start)
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(url) as response:
        html = response.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")

    pinned_classes = ("sticky", "announce", "global-announce")
    found_new = False

    for row in soup.find_all("li", class_="row"):
        classes = row.get("class")
        if any(marker in classes for marker in pinned_classes):
            continue

        topic = row.find("dl")

        # Link info
        link = topic.find(class_="topictitle")
        topic_id = regex_id.match(link.get("href")).group(1)
        title = link.find(text=True)

        # Date and author: the date is the text after the "»" separator,
        # the author is the last link in the same cell.
        left = topic.find("dt")
        date = left.get_text().split("»")[1].strip()
        date = datetime.strptime(date, "%a %b %d, %Y %H:%M")
        author = left.find_all("a")[-1].get_text().strip()

        # Get counts (kept as the text found on the page, not converted)
        posts = topic.find(class_="posts").find(text=True)
        views = topic.find(class_="views").find(text=True)

        if topic_id in out:
            # A topic seen before means the listing has started repeating.
            print(" - got {} again, title: {}".format(topic_id, title))
            assert(title == out[topic_id]['title'])
            return False

        info = {
            "id" : topic_id,
            "title" : title,
            "author": author,
            "posts" : posts,
            "views" : views,
            "date" : date,
        }
        if extra is not None:
            info.update(extra)

        out[topic_id] = info
        found_new = True

    # A page without any regular topic means there is nothing left to fetch.
    # Returning True here would keep the caller requesting pages forever.
    return found_new
|
||||
|
||||
def getTopicsFromForum(id, out=None, extra=None):
    """Collect every topic of a forum by fetching its list pages in order.

    Pages are requested starting at page 0 until parseForumListPage()
    returns a falsy value.

    Args:
        id: Forum id to fetch.
        out: Optional dict to add the topics to, which lets several forums
            be merged into one dict. A new dict is created when omitted.
        extra: Optional dict whose items are copied into every topic row.

    Returns:
        The dict holding the collected topics (``out`` itself when given).
    """
    # A literal {} default would be created once and shared by every call,
    # so topics from earlier calls would leak into later results.
    if out is None:
        out = {}

    print("Fetching all topics from forum {}".format(id))
    page = 0
    while parseForumListPage(id, page, out, extra):
        page += 1

    return out
|
||||
|
||||
def dumpTitlesToFile(topics, path):
    """Write the title of every topic to a text file, one title per line.

    Args:
        topics: Dict whose values are dicts with a "title" entry.
        path: File to write; an existing file is overwritten.
    """
    # Pin the encoding: without it the platform locale decides, and titles
    # containing non-ASCII characters can fail to encode.
    with open(path, "w", encoding="utf-8") as out_file:
        out_file.writelines(topic["title"] + "\n" for topic in topics.values())
|
||||
|
@ -17,8 +17,7 @@
|
||||
<form method="post" action="" class="box-body">
|
||||
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
|
||||
<select name="action">
|
||||
<option value="importusers">Create users from mod list</option>
|
||||
<option value="importmodlist">Import Krock's mod list</option>
|
||||
<option value="importmodlist">Import forum topics</option>
|
||||
<option value="importscreenshots" selected>Import screenshots from VCS</option>
|
||||
<option value="importdepends">Import dependencies from downloads</option>
|
||||
<option value="modprovides">Set provides to mod name</option>
|
||||
|
@ -11,12 +11,12 @@
|
||||
{% for topic in topics %}
|
||||
<tr>
|
||||
<td>{{ topic.topic_id }}</td>
|
||||
<td>[{{ topic.getType().value }}] <a href="https://forum.minetest.net/viewtopic.php?t={{ topic.topic_id}}">{{ topic.title }}</a></td>
|
||||
<td>[{{ topic.type.value }}] <a href="https://forum.minetest.net/viewtopic.php?t={{ topic.topic_id}}">{{ topic.title }}</a></td>
|
||||
{% if show_author %}
|
||||
<td><a href="{{ url_for('user_profile_page', username=topic.author.username) }}">{{ topic.author.display_name}}</a></td>
|
||||
{% endif %}
|
||||
<td>{{ topic.name or ""}}</td>
|
||||
<td><a href="{{ topic.link }}">{{ topic.link | domain }}</a></td>
|
||||
<td>{% if topic.link %}<a href="{{ topic.link }}">{{ topic.link | domain }}</a>{% endif %}</td>
|
||||
<td>
|
||||
<a href="{{ url_for('create_edit_package_page', author=topic.author.username, repo=topic.getRepoURL(), forums=topic.topic_id, title=topic.title, bname=topic.name) }}">Create</a>
|
||||
</td>
|
||||
|
@ -292,7 +292,7 @@
|
||||
<ul>
|
||||
{% for t in similar_topics %}
|
||||
<li>
|
||||
[{{ t.getType().value }}]
|
||||
[{{ t.type.value }}]
|
||||
<a href="https://forum.minetest.net/viewtopic.php?t={{ t.topic_id }}">
|
||||
{{ t.title }} by {{ t.author.display_name }}
|
||||
</a>
|
||||
|
@ -21,7 +21,7 @@ from flask.ext import menu
|
||||
from app import app
|
||||
from app.models import *
|
||||
from app.tasks.importtasks import importRepoScreenshot, importAllDependencies
|
||||
from app.tasks.forumtasks import importUsersFromModList, importKrocksModList
|
||||
from app.tasks.forumtasks import importTopicList
|
||||
from flask_wtf import FlaskForm
|
||||
from wtforms import *
|
||||
from app.utils import loginUser, rank_required
|
||||
@ -31,11 +31,8 @@ from app.utils import loginUser, rank_required
|
||||
def admin_page():
|
||||
if request.method == "POST":
|
||||
action = request.form["action"]
|
||||
if action == "importusers":
|
||||
task = importUsersFromModList.delay()
|
||||
return redirect(url_for("check_task", id=task.id, r=url_for("user_list_page")))
|
||||
elif action == "importmodlist":
|
||||
task = importKrocksModList.delay()
|
||||
if action == "importmodlist":
|
||||
task = importTopicList.delay()
|
||||
return redirect(url_for("check_task", id=task.id, r=url_for("todo_topics_page")))
|
||||
elif action == "importscreenshots":
|
||||
packages = Package.query \
|
||||
|
@ -100,11 +100,11 @@ def package_page(package):
|
||||
package.checkPerm(current_user, Permission.APPROVE_NEW)
|
||||
|
||||
similar_topics = None if not show_similar_topics else \
|
||||
KrockForumTopic.query \
|
||||
ForumTopic.query \
|
||||
.filter_by(name=package.name) \
|
||||
.filter(KrockForumTopic.topic_id != package.forums) \
|
||||
.filter(~ db.exists().where(Package.forums==KrockForumTopic.topic_id)) \
|
||||
.order_by(db.asc(KrockForumTopic.name), db.asc(KrockForumTopic.title)) \
|
||||
.filter(ForumTopic.topic_id != package.forums) \
|
||||
.filter(~ db.exists().where(Package.forums==ForumTopic.topic_id)) \
|
||||
.order_by(db.asc(ForumTopic.name), db.asc(ForumTopic.title)) \
|
||||
.all()
|
||||
|
||||
releases = getReleases(package)
|
||||
|
@ -41,8 +41,8 @@ def todo_page():
|
||||
screenshots = PackageScreenshot.query.filter_by(approved=False).all()
|
||||
|
||||
|
||||
topics_to_add = KrockForumTopic.query \
|
||||
.filter(~ db.exists().where(Package.forums==KrockForumTopic.topic_id)) \
|
||||
topics_to_add = ForumTopic.query \
|
||||
.filter(~ db.exists().where(Package.forums==ForumTopic.topic_id)) \
|
||||
.count()
|
||||
|
||||
return render_template("todo/list.html", title="Reports and Work Queue",
|
||||
@ -54,11 +54,11 @@ def todo_page():
|
||||
@app.route("/todo/topics/")
|
||||
@login_required
|
||||
def todo_topics_page():
|
||||
total = KrockForumTopic.query.count()
|
||||
total = ForumTopic.query.count()
|
||||
|
||||
topics = KrockForumTopic.query \
|
||||
.filter(~ db.exists().where(Package.forums==KrockForumTopic.topic_id)) \
|
||||
.order_by(db.asc(KrockForumTopic.name), db.asc(KrockForumTopic.title)) \
|
||||
topics = ForumTopic.query \
|
||||
.filter(~ db.exists().where(Package.forums==ForumTopic.topic_id)) \
|
||||
.order_by(db.asc(ForumTopic.name), db.asc(ForumTopic.title)) \
|
||||
.all()
|
||||
|
||||
return render_template("todo/topics.html", topics=topics, total=total)
|
||||
|
@ -98,10 +98,10 @@ def user_profile_page(username):
|
||||
|
||||
topics_to_add = None
|
||||
if current_user == user or user.checkPerm(current_user, Permission.CHANGE_AUTHOR):
|
||||
topics_to_add = KrockForumTopic.query \
|
||||
topics_to_add = ForumTopic.query \
|
||||
.filter_by(author_id=user.id) \
|
||||
.filter(~ db.exists().where(Package.forums==KrockForumTopic.topic_id)) \
|
||||
.order_by(db.asc(KrockForumTopic.name), db.asc(KrockForumTopic.title)) \
|
||||
.filter(~ db.exists().where(Package.forums==ForumTopic.topic_id)) \
|
||||
.order_by(db.asc(ForumTopic.name), db.asc(ForumTopic.title)) \
|
||||
.all()
|
||||
|
||||
# Process GET or invalid POST
|
||||
|
55
migrations/versions/9fc23495713b_.py
Normal file
55
migrations/versions/9fc23495713b_.py
Normal file
@ -0,0 +1,55 @@
|
||||
"""empty message
|
||||
|
||||
Revision ID: 9fc23495713b
|
||||
Revises: de004661c5e1
|
||||
Create Date: 2018-07-04 00:03:20.123285
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = '9fc23495713b'
|
||||
down_revision = 'de004661c5e1'
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
from sqlalchemy.dialects.postgresql import ENUM
|
||||
|
||||
type_enum = ENUM('MOD', 'GAME', 'TXP', name='packagetype', create_type=False)
|
||||
|
||||
def upgrade():
    """Replace the ``krock_forum_topic`` table with the new ``forum_topic`` table.

    The old table is dropped without copying its rows.
    """
    # Make sure the 'packagetype' enum exists before a column refers to it.
    # checkfirst=True skips creation when the type is already there.
    type_enum.create(op.get_bind(), checkfirst=True)

    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_table('krock_forum_topic')
    op.create_table('forum_topic',
        sa.Column('topic_id', sa.Integer(), autoincrement=False, nullable=False),
        sa.Column('author_id', sa.Integer(), nullable=False),
        # NOT NULL, matching the ForumTopic.type column declared in the model.
        sa.Column('type', type_enum, nullable=False),
        sa.Column('title', sa.String(length=200), nullable=False),
        sa.Column('name', sa.String(length=30), nullable=True),
        sa.Column('link', sa.String(length=200), nullable=True),
        sa.Column('posts', sa.Integer(), nullable=False),
        sa.Column('views', sa.Integer(), nullable=False),
        sa.Column('created_at', sa.DateTime(), nullable=False),
        sa.ForeignKeyConstraint(['author_id'], ['user.id'], ),
        sa.PrimaryKeyConstraint('topic_id')
    )
    # ### end Alembic commands ###
|
||||
# ### end Alembic commands ###
|
||||
|
||||
|
||||
def downgrade():
    """Drop ``forum_topic`` and recreate an empty ``krock_forum_topic`` table."""
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_table('forum_topic')

    # Schema of the table as it was before this revision.
    legacy_columns = [
        sa.Column('topic_id', sa.Integer(), autoincrement=False, nullable=False),
        sa.Column('author_id', sa.Integer(), nullable=False),
        sa.Column('ttype', sa.Integer(), nullable=False),
        sa.Column('title', sa.String(length=200), nullable=False),
        sa.Column('name', sa.String(length=30), nullable=True),
        sa.Column('link', sa.String(length=50), nullable=True),
    ]
    legacy_constraints = [
        sa.ForeignKeyConstraint(['author_id'], ['user.id'], ),
        sa.PrimaryKeyConstraint('topic_id'),
    ]
    op.create_table('krock_forum_topic', *legacy_columns, *legacy_constraints)
    # ### end Alembic commands ###
|
||||
# ### end Alembic commands ###
|
Loading…
Reference in New Issue
Block a user