mirror of
https://github.com/minetest/contentdb.git
synced 2025-01-24 23:11:33 +01:00
150 lines
3.8 KiB
Python
150 lines
3.8 KiB
Python
|
# ContentDB
|
||
|
# Copyright (C) 2022 rubenwardy
|
||
|
#
|
||
|
# This program is free software: you can redistribute it and/or modify
|
||
|
# it under the terms of the GNU Affero General Public License as published by
|
||
|
# the Free Software Foundation, either version 3 of the License, or
|
||
|
# (at your option) any later version.
|
||
|
#
|
||
|
# This program is distributed in the hope that it will be useful,
|
||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
|
# GNU Affero General Public License for more details.
|
||
|
#
|
||
|
# You should have received a copy of the GNU Affero General Public License
|
||
|
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||
|
|
||
|
import datetime
|
||
|
import inspect
|
||
|
import os
|
||
|
import re
|
||
|
import sys
|
||
|
import gzip
|
||
|
import user_agents
|
||
|
from urllib.parse import urlparse, parse_qs, unquote
|
||
|
|
||
|
# Fall back to the default config path when the caller has not chosen one;
# an existing FLASK_CONFIG value is left untouched.
os.environ.setdefault("FLASK_CONFIG", "../config.cfg")
|
||
|
|
||
|
# The directory containing the access logs is the first command-line argument.
# sys.exit() with a string prints it to stderr and exits with status 1, so the
# exit status is the same as before but the reason is now reported.
if len(sys.argv) < 2:
    sys.exit(f"Usage: {sys.argv[0]} LOGS_DIR")

logs_dir = sys.argv[1].strip()
if not os.path.isdir(logs_dir):
    sys.exit(f"Not a directory: {logs_dir}")
|
||
|
|
||
|
# Allow finding the `app` module: put the parent of this script's directory
# at the front of the import search path.
script_path = inspect.getfile(inspect.currentframe())
currentdir = os.path.dirname(os.path.abspath(script_path))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)
|
||
|
|
||
|
from app.models import db, Package, PackageDailyStats
|
||
|
|
||
|
# Release download URL. Group 1 is the author segment, group 2 the package
# name segment; the match runs up to the next space, so it includes any
# query string.
url_re = re.compile(
    r"\/packages\/([^\/]+)\/([^\/]+)"
    r"\/releases\/[0-9]+\/download\/[^ ]*"
)

# Bracketed timestamp, e.g. [05/Nov/2022:12:34:56 +0000]; group 1 is the text
# between the brackets.
dt_re = re.compile(r"\[(\d+\/\w+\/[\d: +-]+)\]")

# Last double-quoted field on the line, which is read as the user agent.
ua_re = re.compile(r"\"([^\"]+)\"$")
|
||
|
|
||
|
# PackageDailyStats rows created during this run, keyed by "<iso date>/<package id>".
row_lookup = {}

# Lower-cased "author/name" -> package id, for resolving download URLs.
package_id_by_key = {
    f"{package.author.username}/{package.name}".lower(): package.id
    for package in Package.query.all()
}
|
||
|
|
||
|
# Names of the regular files directly inside logs_dir; anything that is not
# a file (e.g. a subdirectory) is skipped.
log_files = []
for entry in os.listdir(logs_dir):
    if os.path.isfile(os.path.join(logs_dir, entry)):
        log_files.append(entry)

# Cache: user-agent string -> whether it was classified as a bot.
ua_is_bot = {}
|
||
|
|
||
|
|
||
|
def my_open(path):
    """Open a log file for reading as text, decompressing it when gzipped.

    Access logs can contain arbitrary bytes sent by clients, so the content is
    decoded as UTF-8 and undecodable bytes are replaced with U+FFFD instead of
    raising ``UnicodeDecodeError`` part-way through a file.

    Args:
        path: Path of the file to open. A name ending in ``.gz`` is opened
            through ``gzip``; anything else is opened as a plain file.

    Returns:
        A text-mode file object, usable as a context manager.
    """
    opener = gzip.open if path.endswith(".gz") else open
    return opener(path, "rt", encoding="utf-8", errors="replace")
|
||
|
|
||
|
|
||
|
# Downloads dated on or after this day (UTC) are not imported from the logs.
# NOTE(review): presumably stats from this day on are recorded elsewhere - confirm.
import_cutoff = datetime.date(2022, 11, 6)

# Count every non-bot release download found in the access logs into one
# PackageDailyStats row per (UTC date, package), then commit all rows at once.
for log_file in log_files:
    print(f"Importing from {log_file}")
    with my_open(os.path.join(logs_dir, log_file)) as infile:
        for line in infile:
            # Cheap substring test before running the regexes.
            if "/download/" not in line:
                continue

            url_match = url_re.search(line)
            if url_match is None:
                continue

            url = url_match.group(0)
            author = unquote(url_match.group(1))
            name = url_match.group(2)

            # parse_qs maps each key to a list of values; only the first
            # `reason` value is used. `reason` stays None when absent.
            reason = parse_qs(urlparse(url).query).get("reason")
            if reason:
                reason = reason[0]

            # Skip (and report) lines without a timestamp, the same way lines
            # without a user agent are handled below.
            dt_match = dt_re.search(line)
            if dt_match is None:
                print("No timestamp: " + line)
                continue

            # Parse the timestamp with its UTC offset, then take the UTC date.
            dt = datetime.datetime.strptime(dt_match.group(1), "%d/%b/%Y:%H:%M:%S %z")
            date = dt.astimezone(datetime.timezone.utc).date()
            if date >= import_cutoff:
                continue

            ua_match = ua_re.search(line)
            if ua_match is None:
                print("No UA: " + line)
                continue

            # Classify each distinct user agent only once; the result is cached.
            ua = ua_match.group(1)
            is_bot = ua_is_bot.get(ua)
            if is_bot is None:
                is_bot = user_agents.parse(ua).is_bot
                ua_is_bot[ua] = is_bot
            if is_bot:
                continue

            package_key = f"{author}/{name}".lower()
            package_id = package_id_by_key.get(package_key)
            if package_id is None:
                print(f"Package not found: {package_key}")
                continue

            # One stats row per day and package, created on first sight.
            key = f"{date.isoformat()}/{package_id}"
            row = row_lookup.get(key)
            if not row:
                row = PackageDailyStats()
                row.date = date
                row.package_id = package_id

                row.platform_minetest = 0
                row.platform_other = 0
                row.reason_new = 0
                row.reason_dependency = 0
                row.reason_update = 0

                db.session.add(row)
                row_lookup[key] = row

            if ua.startswith("Minetest/"):
                row.platform_minetest += 1
            else:
                row.platform_other += 1

            # Downloads with a missing or unrecognised reason only count
            # towards the platform totals.
            if reason == "new":
                row.reason_new += 1
            elif reason == "dependency":
                row.reason_dependency += 1
            elif reason == "update":
                row.reason_update += 1

db.session.commit()
|