contentdb/app/tasks/pkgtasks.py
rubenwardy 837d0b5bc1 Link Checker: Allow 403 status codes
Cloudflare likes to break the Internet, so we'll have to ignore
403 errors from sites in the link checker.
2024-09-05 19:19:16 +01:00

212 lines
6.8 KiB
Python

# ContentDB
# Copyright (C) 2018-21 rubenwardy
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import datetime
import random
import re
import sys
from time import sleep
from urllib.parse import urlparse
from typing import Optional
import requests
import urllib3
from app import app
from sqlalchemy import or_, and_
from app.markdown import get_links, render_markdown
from app.models import Package, db, PackageState, AuditLogEntry, AuditSeverity
from app.tasks import celery, TaskError
from app.utils import post_bot_message, post_to_approval_thread, get_system_user, add_audit_log
@celery.task()
def update_package_scores():
    """Decay every package's download score, then recompute all package scores.

    First multiplies ``score_downloads`` by 0.93 for all packages in a single
    bulk UPDATE and commits. Then calls ``recalculate_score()`` on each package
    and commits a second time.
    """
    decayed_columns = {"score_downloads": Package.score_downloads * 0.93}
    Package.query.update(decayed_columns)
    db.session.commit()

    every_package = Package.query.all()
    for pkg in every_package:
        pkg.recalculate_score()

    db.session.commit()
def desc_contains(desc: str, search_str: str) -> bool:
    """Return whether *desc* contains the link described by *search_str*.

    For ordinary links this is a plain substring test.

    Forum topic links are passed in the templated form
    ``https://forum.minetest.net/viewtopic.php?%t=<id>``, where ``%`` stands for
    "any run of characters" (e.g. other query parameters such as ``f=9&``).
    Those are matched with a regular expression instead.

    Args:
        desc: The text to search in.
        search_str: The link to look for; may contain ``%`` wildcards when it is
            a forum topic link.

    Returns:
        True when the link occurs in *desc*, False otherwise.
    """
    forum_prefix = "https://forum.minetest.net/viewtopic.php?%t="
    if not search_str.startswith(forum_prefix):
        return search_str in desc

    # Escape every literal piece and rejoin on ".*" so that only the "%"
    # placeholders act as wildcards.
    pattern = ".*".join(re.escape(piece) for piece in search_str.split("%"))

    # The topic id ends the pattern: refuse a following digit so that topic
    # 123 does not also match a link to topic 1234.
    return re.search(pattern + r"(?!\d)", desc) is not None
@celery.task()
def notify_about_git_forum_links():
    """Tell package authors to link to ContentDB pages rather than repos/forum topics.

    Collects the repository URL and forum topic URL of every approved package,
    finds approved packages whose description mentions a link belonging to a
    *different* package, and posts a bot message on each listing the links
    that could be replaced with ContentDB URLs.

    Forum links are stored in the templated form
    ``https://forum.minetest.net/viewtopic.php?%t=<id>``; the ``%`` acts as a
    wildcard both in the SQL ``ILIKE`` pre-filter and in ``desc_contains``.
    """
    package_links = [(row[0], row[1]) for row in db.session.query(Package, Package.repo)
            .filter(Package.repo.is_not(None), Package.state == PackageState.APPROVED).all()]

    for pair in db.session.query(Package, Package.forums) \
            .filter(Package.forums.is_not(None), Package.state == PackageState.APPROVED).all():
        package_links.append((pair[0], f"https://forum.minetest.net/viewtopic.php?%t={pair[1]}"))

    if not package_links:
        # Nothing to look for. This also avoids calling or_() with no
        # arguments, which SQLAlchemy deprecates.
        return

    # Coarse database-side filter; the exact check is repeated below in Python.
    clauses = [and_(Package.id != pair[0].id, Package.desc.ilike(f"%{pair[1]}%")) for pair in package_links]
    packages = Package.query.filter(Package.desc != "", Package.desc.is_not(None), Package.state == PackageState.APPROVED, or_(*clauses)).all()

    # Lower-case each link once instead of once per (package, link) pair.
    lowered_links = [(link_package, link, link.lower()) for link_package, link in package_links]

    for package in packages:
        desc_lower = package.desc.lower()
        links = [
            (link_package, link)
            for link_package, link, needle in lowered_links
            if link_package != package and desc_contains(desc_lower, needle)
        ]
        if not links:
            continue

        title = "You should link to ContentDB pages instead of repos/forum topics"
        msg = ("You should update your long description to link to ContentDB pages instead of repositories or "
               "forum topics, where possible. \n"
               "You should also remove lists of dependencies, as CDB already shows that.\n\n"
               "There's a ContentDB dialog redesign coming to Minetest 5.9.0. "
               "Clicking a ContentDB link stays inside Minetest but an external repository / forums "
               "link will open a web browser. Therefore, linking to ContentDB pages when referring to a "
               "package will improve the user experience.\n\nHere are some URLs you might wish to replace:\n")
        # The "%" wildcard is stripped so the user sees a plain URL.
        msg += "".join(
            f"\n* {link.replace('%', '')} -> {link_package.get_url('packages.view', absolute=True)}"
            for link_package, link in links
        )

        post_bot_message(package, title, msg)

    db.session.commit()
@celery.task()
def clear_removed_packages(all_packages: bool):
    """Permanently delete packages that are in the DELETED state.

    Args:
        all_packages: When true, every package in the DELETED state is removed.
            Otherwise only those with fewer than 1000 downloads and no audit
            log entry created within the last 365 days are removed.

    Returns:
        A human-readable summary with the number of packages deleted.
    """
    if all_packages:
        query = Package.query.filter_by(state=PackageState.DELETED)
    else:
        # NOTE(review): naive local time; confirm it matches how
        # AuditLogEntry.created_at is stored.
        one_year_ago = datetime.datetime.now() - datetime.timedelta(days=365)
        query = Package.query.filter(
            Package.state == PackageState.DELETED,
            Package.downloads < 1000,
            ~Package.audit_log_entries.any(AuditLogEntry.created_at > one_year_ago))

    # Fetch once and count the rows actually deleted, rather than issuing a
    # separate COUNT query whose result may differ from what is iterated.
    doomed = query.all()
    for pkg in doomed:
        # NOTE(review): presumably detaches the review thread before the
        # delete; confirm against the model's relationship/cascade rules.
        pkg.review_thread = None
        db.session.delete(pkg)
    db.session.commit()

    return f"Deleted {len(doomed)} soft deleted packages"
def _url_exists(url: str) -> str:
    """Check whether *url* can be fetched with an HTTP GET.

    Args:
        url: Absolute http(s) URL to request.

    Returns:
        An empty string when the link is considered fine. Otherwise a short
        description of the failure: the HTTP status code as a string,
        ``"ConnectionError"``, ``"timeout"``, or the class name of any other
        ``requests`` exception (e.g. ``"TooManyRedirects"``).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; ContentDB link checker; +https://content.minetest.net/)",
    }

    try:
        # stream=True: only status and headers are needed, not the body.
        with requests.get(url, stream=True, headers=headers, timeout=10) as response:
            response.raise_for_status()
            return ""
    except requests.exceptions.HTTPError as e:
        # 403 is accepted: sites behind Cloudflare answer 403 to the checker
        # even though the page exists.
        if e.response.status_code == 403:
            return ""

        print(f" - [{e.response.status_code}] <{url}>", file=sys.stderr)
        return str(e.response.status_code)
    except requests.exceptions.ConnectionError:
        # Kept ahead of the timeout handler so connect timeouts (which are
        # also ConnectionErrors) keep reporting as before.
        return "ConnectionError"
    except (requests.exceptions.Timeout, urllib3.exceptions.ReadTimeoutError):
        # requests wraps urllib3's ReadTimeoutError in its own ReadTimeout, so
        # catching only the urllib3 class lets read timeouts escape.
        return "timeout"
    except requests.exceptions.RequestException as e:
        # Redirect loops, malformed URLs, broken encodings, ...: report the
        # link as bad instead of crashing the whole check.
        return type(e).__name__
def _check_for_dead_links(package: Package) -> dict[str, str]:
    """Probe every http(s) link attached to *package* and collect the failures.

    The links are the package's metadata URLs plus, when a description is
    present, whatever ``get_links`` extracts from the rendered description.
    Hosts listed in the ``LINK_CHECKER_IGNORED_URLS`` config entry are skipped.

    Returns:
        A mapping of each failing link to the non-empty result string returned
        by ``_url_exists`` for it.
    """
    skipped_hosts = set(app.config.get("LINK_CHECKER_IGNORED_URLS", ""))

    candidates: set[Optional[str]] = set((
        package.repo,
        package.website,
        package.issueTracker,
        package.forums_url,
        package.video_url,
        package.donate_url_actual,
        package.translation_url,
    ))

    if package.desc:
        page_url = package.get_url("packages.view", absolute=True)
        rendered = render_markdown(package.desc)
        candidates.update(get_links(rendered, page_url))

    print(f"Checking {package.title} ({len(candidates)} links) for broken links", file=sys.stderr)

    failures = {}
    for candidate in candidates:
        # Unset metadata fields show up as None.
        if candidate is None:
            continue

        parsed = urlparse(candidate)
        if parsed.scheme not in ("http", "https") or parsed.hostname in skipped_hosts:
            continue

        outcome = _url_exists(candidate)
        if outcome != "":
            failures[candidate] = outcome

        # Prevent leaking information (random pause after each request)
        sleep(random.uniform(0.4, 0.6))

    return failures
def _check_package(package: Package) -> Optional[str]:
    """Build a markdown report of the broken links found on *package*.

    Returns:
        None when ``_check_for_dead_links`` reports nothing; otherwise a
        message with one bullet per broken link and its failure reason.
    """
    dead_links = _check_for_dead_links(package)
    if not dead_links:
        return None

    bullet_lines = [f"- <{url}> [{reason}]" for url, reason in dead_links.items()]
    return "The following broken links were found on your package:\n\n" + "\n".join(bullet_lines)
@celery.task()
def check_package_on_submit(package_id: int):
    """Run the link check on a freshly submitted package.

    Does nothing unless the package is in the READY_FOR_REVIEW state. When the
    check produces a report, the package is moved to CHANGES_NEEDED, the status
    change and the report are posted to the approval thread as the system
    user, an audit log entry is added, and the session is committed.

    Raises:
        TaskError: If no package has the given id.
    """
    package = Package.query.get(package_id)
    if package is None:
        raise TaskError("No such package")

    if package.state != PackageState.READY_FOR_REVIEW:
        return

    report = _check_package(package)
    if not report:
        return

    status_line = f"Marked {package.title} as {PackageState.CHANGES_NEEDED.value}"
    bot_user = get_system_user()

    # Status update first, then the detailed list of broken links.
    for text, is_status in ((status_line, True), (report, False)):
        post_to_approval_thread(package, bot_user, text, is_status_update=is_status, create_thread=True)

    package.state = PackageState.CHANGES_NEEDED
    add_audit_log(AuditSeverity.EDITOR, bot_user, status_line, package.get_url("packages.view"), package)
    db.session.commit()
@celery.task(rate_limit="5/m")
def check_package_for_broken_links(package_id: int):
    """Check a package's links and post a bot message if any are broken.

    The task is rate limited to five runs per minute via the Celery decorator.

    Raises:
        TaskError: If no package has the given id.
    """
    package = Package.query.get(package_id)
    if package is None:
        raise TaskError("No such package")

    report = _check_package(package)
    if not report:
        return

    post_bot_message(package, "Broken links", report)
    db.session.commit()