contentdb/utils/import_nginx_logs.py

# ContentDB
# Copyright (C) 2022 rubenwardy
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import datetime
import inspect
import os
import re
import sys
import gzip
import user_agents
from urllib.parse import urlparse, parse_qs, unquote

if not "FLASK_CONFIG" in os.environ:
	os.environ["FLASK_CONFIG"] = "../config.cfg"

logs_dir = sys.argv[1].strip()
if not os.path.isdir(logs_dir):
	sys.exit(1)

# Allow finding the `app` module
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir)

from app.models import db, Package, PackageDailyStats

url_re = re.compile(r"\/packages\/([^\/]+)\/([^\/]+)\/releases\/[0-9]+\/download\/[^ ]*")
dt_re = re.compile(r"\[(\d+\/\w+\/[\d: +-]+)\]")
ua_re = re.compile(r"\"([^\"]+)\"$")

row_lookup = {}
package_id_by_key = {}
for package in Package.query.all():
	package_id_by_key[f"{package.author.username}/{package.name}".lower()] = package.id

log_files = [f for f in os.listdir(logs_dir) if os.path.isfile(os.path.join(logs_dir, f))]

ua_is_bot = {}


def my_open(path):
	if path.endswith(".gz"):
		return gzip.open(path, "rt")
	else:
		return open(path, "r")


for log_file in log_files:
	print(f"Importing from {log_file}")
	with my_open(os.path.join(logs_dir, log_file)) as infile:
		line_no = 1
		for line in infile:
			if "/download/" not in line:
				continue

			line_no += 1

			url_match = url_re.search(line)
			if url_match is None:
				continue

			url = url_match.group(0)
			author = unquote(url_match.group(1))
			name = url_match.group(2)

			parsed_url = urlparse(url)
			reason = parse_qs(parsed_url.query).get("reason")
			if reason:
				reason = reason[0]

			dt_match = dt_re.search(line)
			assert dt_match
			dt = datetime.datetime.strptime(dt_match.group(1), "%d/%b/%Y:%H:%M:%S %z")
			dt = datetime.datetime.utcfromtimestamp(dt.timestamp())
			date = dt.date()

			if date >= datetime.date(2022, 11, 6):
				continue

			# print(line)

			ua_match = ua_re.search(line)
			if ua_match is None:
				print("No UA: " + line)
				continue

			ua = ua_match.group(1)
			is_bot = ua_is_bot.get(ua)
			if is_bot:
				continue
			if is_bot is None:
				user_agent = user_agents.parse(ua)
				ua_is_bot[ua] = user_agent.is_bot
				if user_agent.is_bot:
					continue

			package_key = f"{author}/{name}".lower()
			package_id = package_id_by_key.get(package_key)
			if package_id is None:
				print(f"Package not found: {package_key}")
				continue

			key = f"{date.isoformat()}/{package_id}"
			# print(author, name, reason, ua, date, key)

			row = row_lookup.get(key)
			if not row:
				row = PackageDailyStats()
				row.date = date
				row.package_id = package_id

				row.platform_minetest = 0
				row.platform_other = 0
				row.reason_new = 0
				row.reason_dependency = 0
				row.reason_update = 0

				db.session.add(row)
				row_lookup[key] = row

			if ua.startswith("Minetest/") or ua.startswith("Luanti/"):
				row.platform_minetest += 1
			else:
				row.platform_other += 1

			if reason == "new":
				row.reason_new += 1
			elif reason == "dependency":
				row.reason_dependency += 1
			elif reason == "update":
				row.reason_update += 1

			# if line_no > 1000:
			# 	break

db.session.commit()
Add script to import stats from nginx logs 2022-11-06 17:54:06 +01:00			`# ContentDB`
			`# Copyright (C) 2022 rubenwardy`
			`#`
			`# This program is free software: you can redistribute it and/or modify`
			`# it under the terms of the GNU Affero General Public License as published by`
			`# the Free Software Foundation, either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# This program is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU Affero General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU Affero General Public License`
			`# along with this program. If not, see <https://www.gnu.org/licenses/>.`

			`import datetime`
			`import inspect`
			`import os`
			`import re`
			`import sys`
			`import gzip`
			`import user_agents`
			`from urllib.parse import urlparse, parse_qs, unquote`

			`if not "FLASK_CONFIG" in os.environ:`
			`os.environ["FLASK_CONFIG"] = "../config.cfg"`

			`logs_dir = sys.argv[1].strip()`
			`if not os.path.isdir(logs_dir):`
			`sys.exit(1)`

			# Allow finding the `app` module
			`currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))`
			`parentdir = os.path.dirname(currentdir)`
			`sys.path.insert(0,parentdir)`

			`from app.models import db, Package, PackageDailyStats`

			`url_re = re.compile(r"\/packages\/([^\/]+)\/([^\/]+)\/releases\/[0-9]+\/download\/[^ ]*")`
			`dt_re = re.compile(r"\[(\d+\/\w+\/[\d: +-]+)\]")`
			`ua_re = re.compile(r"\"([^\"]+)\"$")`

			`row_lookup = {}`
			`package_id_by_key = {}`
			`for package in Package.query.all():`
			`package_id_by_key[f"{package.author.username}/{package.name}".lower()] = package.id`

			`log_files = [f for f in os.listdir(logs_dir) if os.path.isfile(os.path.join(logs_dir, f))]`

			`ua_is_bot = {}`


			`def my_open(path):`
			`if path.endswith(".gz"):`
			`return gzip.open(path, "rt")`
			`else:`
			`return open(path, "r")`


			`for log_file in log_files:`
			`print(f"Importing from {log_file}")`
			`with my_open(os.path.join(logs_dir, log_file)) as infile:`
			`line_no = 1`
			`for line in infile:`
			`if "/download/" not in line:`
			`continue`

			`line_no += 1`

			`url_match = url_re.search(line)`
			`if url_match is None:`
			`continue`

			`url = url_match.group(0)`
			`author = unquote(url_match.group(1))`
			`name = url_match.group(2)`

			`parsed_url = urlparse(url)`
			`reason = parse_qs(parsed_url.query).get("reason")`
			`if reason:`
			`reason = reason[0]`

			`dt_match = dt_re.search(line)`
			`assert dt_match`
			`dt = datetime.datetime.strptime(dt_match.group(1), "%d/%b/%Y:%H:%M:%S %z")`
			`dt = datetime.datetime.utcfromtimestamp(dt.timestamp())`
			`date = dt.date()`

			`if date >= datetime.date(2022, 11, 6):`
			`continue`

			`# print(line)`

			`ua_match = ua_re.search(line)`
			`if ua_match is None:`
			`print("No UA: " + line)`
			`continue`

			`ua = ua_match.group(1)`
			`is_bot = ua_is_bot.get(ua)`
			`if is_bot:`
			`continue`
			`if is_bot is None:`
			`user_agent = user_agents.parse(ua)`
			`ua_is_bot[ua] = user_agent.is_bot`
			`if user_agent.is_bot:`
			`continue`

			`package_key = f"{author}/{name}".lower()`
			`package_id = package_id_by_key.get(package_key)`
			`if package_id is None:`
			`print(f"Package not found: {package_key}")`
			`continue`

			`key = f"{date.isoformat()}/{package_id}"`
			`# print(author, name, reason, ua, date, key)`

			`row = row_lookup.get(key)`
			`if not row:`
			`row = PackageDailyStats()`
			`row.date = date`
			`row.package_id = package_id`

			`row.platform_minetest = 0`
			`row.platform_other = 0`
			`row.reason_new = 0`
			`row.reason_dependency = 0`
			`row.reason_update = 0`

			`db.session.add(row)`
			`row_lookup[key] = row`

Fix statistics reporting the 5.10 client as web (#568) 2024-11-22 20:02:36 +01:00			`if ua.startswith("Minetest/") or ua.startswith("Luanti/"):`
Add script to import stats from nginx logs 2022-11-06 17:54:06 +01:00			`row.platform_minetest += 1`
			`else:`
			`row.platform_other += 1`

			`if reason == "new":`
			`row.reason_new += 1`
			`elif reason == "dependency":`
			`row.reason_dependency += 1`
			`elif reason == "update":`
			`row.reason_update += 1`

			`# if line_no > 1000:`
			`# break`

			`db.session.commit()`