feat: archive git repository (experimental)

See doc/git-archive.md for general Git archive specifications
See doc/repos/metadata-repo.md for info and direction related to the new Git metadata archive
This commit is contained in:
Kevin Morris 2022-09-24 16:51:25 +00:00
parent ec3152014b
commit 30e72d2db5
34 changed files with 1104 additions and 50 deletions

View file

@ -0,0 +1 @@
# aurweb.archives

View file

@ -0,0 +1 @@
# aurweb.archives.spec

View file

@ -0,0 +1,77 @@
from pathlib import Path
from typing import Any, Dict, Iterable, List, Set
class GitInfo:
    """Information about a Git repository."""

    # Path to the Git repository (stored as a pathlib.Path).
    path: Path

    # Local Git repository configuration; applied key-by-key to the
    # repository's config when it is initialized.
    config: Dict[str, Any]

    def __init__(self, path: str, config: Dict[str, Any] = None) -> None:
        """
        :param path: Path to the Git repository
        :param config: Local Git configuration key/value pairs
        """
        self.path = Path(path)
        # Avoid the shared-mutable-default pitfall: every instance gets
        # its own configuration dict.
        self.config = config if config is not None else {}


class SpecOutput:
    """Class used for git_archive.py output details."""

    # Filename relative to the Git repository root.
    filename: str

    # Git repository information.
    git_info: GitInfo

    # Bytes bound for `SpecOutput.filename`.
    data: bytes

    def __init__(self, filename: str, git_info: GitInfo, data: bytes) -> None:
        """
        :param filename: Filename relative to the Git repository root
        :param git_info: GitInfo instance
        :param data: Binary data bound for `filename`
        """
        self.filename = filename
        self.git_info = git_info
        self.data = data


class SpecBase:
    """
    Base for Spec classes defined in git_archive.py --spec modules.

    All supported --spec modules must contain the following classes:
        - Spec(SpecBase)
    """

    # A list of SpecOutputs, each of which contain output file data.
    # NOTE: kept at class level for backward compatibility with subclasses
    # that override __init__ without calling super().__init__(); such
    # subclasses share these containers across instances.
    outputs: List[SpecOutput] = list()

    # A set of repositories to commit changes to.
    repos: Set[str] = set()

    def __init__(self) -> None:
        # Give each instance its own containers so output/repo state is
        # never shared between Spec instances.
        self.outputs = list()
        self.repos = set()

    def generate(self) -> Iterable[SpecOutput]:
        """
        "Pure virtual" output generator.

        `SpecBase.outputs` and `SpecBase.repos` should be populated within an
        overridden version of this function in SpecBase derivatives.

        :raises NotImplementedError: Always; derivatives must override.
        """
        raise NotImplementedError()

    def add_output(self, filename: str, git_info: GitInfo, data: bytes) -> None:
        """
        Add a SpecOutput instance to the set of outputs.

        :param filename: Filename relative to the git repository root
        :param git_info: GitInfo instance
        :param data: Binary data bound for `filename`
        """
        # Record the repository so git_archive.py commits it afterward.
        if git_info.path not in self.repos:
            self.repos.add(git_info.path)

        self.outputs.append(
            SpecOutput(
                filename,
                git_info,
                data,
            )
        )

View file

@ -0,0 +1,85 @@
from typing import Iterable
import orjson
from aurweb import config, db
from aurweb.models import Package, PackageBase, User
from aurweb.rpc import RPC
from .base import GitInfo, SpecBase, SpecOutput
ORJSON_OPTS = orjson.OPT_SORT_KEYS | orjson.OPT_INDENT_2
class Spec(SpecBase):
    """Spec producing RPC-style metadata outputs (pkgname/pkgbase JSON)."""

    def __init__(self) -> None:
        super().__init__()
        # Repository which receives the metadata JSON outputs.
        self.metadata_repo = GitInfo(
            config.get("git-archive", "metadata-repo"),
        )

    def generate(self) -> Iterable[SpecOutput]:
        """
        Query the package database and produce two outputs bound for
        the metadata repository:

        - pkgname.json: RPC type=info data keyed by package name
        - pkgbase.json: package-base data keyed by pkgbase name

        :returns: List of SpecOutput instances (`self.outputs`)
        """
        # Base query used by the RPC.
        base_query = (
            db.query(Package)
            .join(PackageBase)
            .join(User, PackageBase.MaintainerUID == User.ID)
        )

        # Create an instance of RPC, use it to get entities from
        # our query and perform a metadata subquery for all packages.
        rpc = RPC(version=5, type="info")
        print("performing package database query")
        packages = rpc.entities(base_query).all()
        print("performing package database subqueries")
        rpc.subquery({pkg.ID for pkg in packages})

        pkgbases, pkgnames = dict(), dict()
        for package in packages:
            # Produce RPC type=info data for `package`
            data = rpc.get_info_json_data(package)

            pkgbase_name = data.get("PackageBase")
            # Split pkgbase-level fields out of the per-package data.
            pkgbase_data = {
                "ID": data.pop("PackageBaseID"),
                "URLPath": data.pop("URLPath"),
                "FirstSubmitted": data.pop("FirstSubmitted"),
                "LastModified": data.pop("LastModified"),
                "OutOfDate": data.pop("OutOfDate"),
                "Maintainer": data.pop("Maintainer"),
                "Keywords": data.pop("Keywords"),
                "NumVotes": data.pop("NumVotes"),
                "Popularity": data.pop("Popularity"),
                "PopularityUpdated": package.PopularityUpdated.timestamp(),
            }

            # Store the data in `pkgbases`. Packages that share a pkgbase
            # simply overwrite the previous entry, so each pkgbase is
            # represented exactly once after this loop.
            pkgbases[pkgbase_name] = pkgbase_data

            # Remove the ID key from package json.
            data.pop("ID")

            # Map the package's Name to its remaining metadata.
            name = data.get("Name")
            pkgnames[name] = data

        # Add metadata outputs
        self.add_output(
            "pkgname.json",
            self.metadata_repo,
            orjson.dumps(pkgnames, option=ORJSON_OPTS),
        )
        self.add_output(
            "pkgbase.json",
            self.metadata_repo,
            orjson.dumps(pkgbases, option=ORJSON_OPTS),
        )

        return self.outputs

View file

@ -0,0 +1,32 @@
from typing import Iterable
import orjson
from aurweb import config, db
from aurweb.models import PackageBase
from .base import GitInfo, SpecBase, SpecOutput
ORJSON_OPTS = orjson.OPT_SORT_KEYS | orjson.OPT_INDENT_2
class Spec(SpecBase):
    """Spec producing a sorted list of active package base names."""

    def __init__(self) -> None:
        super().__init__()
        # Repository which receives the pkgbase.json output.
        self.pkgbases_repo = GitInfo(config.get("git-archive", "pkgbases-repo"))

    def generate(self) -> Iterable[SpecOutput]:
        """
        Produce pkgbase.json: a name-sorted JSON array of package bases
        which currently have a packager assigned.

        :returns: List of SpecOutput instances (`self.outputs`)
        """
        # Only include package bases that have been packaged
        # (PackagerUID is set).
        filt = PackageBase.PackagerUID.isnot(None)
        query = (
            db.query(PackageBase.Name)
            .filter(filt)
            .order_by(PackageBase.Name.asc())
            .all()
        )
        pkgbases = [pkgbase.Name for pkgbase in query]

        self.add_output(
            "pkgbase.json",
            self.pkgbases_repo,
            orjson.dumps(pkgbases, option=ORJSON_OPTS),
        )
        return self.outputs

View file

@ -0,0 +1,33 @@
from typing import Iterable
import orjson
from aurweb import config, db
from aurweb.models import Package, PackageBase
from .base import GitInfo, SpecBase, SpecOutput
ORJSON_OPTS = orjson.OPT_SORT_KEYS | orjson.OPT_INDENT_2
class Spec(SpecBase):
    """Spec producing a sorted list of active package names."""

    def __init__(self) -> None:
        super().__init__()
        # Repository which receives the pkgname.json output.
        self.pkgnames_repo = GitInfo(config.get("git-archive", "pkgnames-repo"))

    def generate(self) -> Iterable[SpecOutput]:
        """
        Produce pkgname.json: a name-sorted JSON array of packages whose
        package base currently has a packager assigned.

        :returns: List of SpecOutput instances (`self.outputs`)
        """
        # Only include packages whose base has been packaged
        # (PackagerUID is set).
        filt = PackageBase.PackagerUID.isnot(None)
        query = (
            db.query(Package.Name)
            .join(PackageBase, PackageBase.ID == Package.PackageBaseID)
            .filter(filt)
            .order_by(Package.Name.asc())
            .all()
        )
        pkgnames = [pkg.Name for pkg in query]

        self.add_output(
            "pkgname.json",
            self.pkgnames_repo,
            orjson.dumps(pkgnames, option=ORJSON_OPTS),
        )
        return self.outputs

View file

@ -0,0 +1,26 @@
from typing import Iterable
import orjson
from aurweb import config, db
from aurweb.models import User
from .base import GitInfo, SpecBase, SpecOutput
ORJSON_OPTS = orjson.OPT_SORT_KEYS | orjson.OPT_INDENT_2
class Spec(SpecBase):
    """Spec producing a sorted list of all usernames."""

    def __init__(self) -> None:
        super().__init__()
        # Repository which receives the users.json output.
        self.users_repo = GitInfo(config.get("git-archive", "users-repo"))

    def generate(self) -> Iterable[SpecOutput]:
        """
        Produce users.json: a sorted JSON array of all usernames.

        :returns: List of SpecOutput instances (`self.outputs`)
        """
        query = db.query(User.Username).order_by(User.Username.asc()).all()
        users = [user.Username for user in query]

        self.add_output(
            "users.json",
            self.users_repo,
            orjson.dumps(users, option=ORJSON_OPTS),
        )
        return self.outputs

View file

@ -64,3 +64,13 @@ class PackageBase(Base):
if key in PackageBase.TO_FLOAT and not isinstance(attr, float):
return float(attr)
return attr
def popularity_decay(pkgbase: PackageBase, utcnow: int):
    """Number of whole days since this pkgbase's popularity was last updated."""
    elapsed_seconds = utcnow - pkgbase.PopularityUpdated.timestamp()
    return int(elapsed_seconds / 86400)
def popularity(pkgbase: PackageBase, utcnow: int):
    """Stored popularity decayed by 2% for each day since its last update."""
    decay = 0.98 ** popularity_decay(pkgbase, utcnow)
    return float(pkgbase.Popularity) * decay

View file

@ -3,8 +3,9 @@ from typing import Any
from fastapi import Request
from sqlalchemy import and_
from aurweb import config, db, defaults, l10n, util
from aurweb import config, db, defaults, l10n, time, util
from aurweb.models import PackageBase, User
from aurweb.models.package_base import popularity
from aurweb.models.package_comaintainer import PackageComaintainer
from aurweb.models.package_comment import PackageComment
from aurweb.models.package_request import PENDING_ID, PackageRequest
@ -81,6 +82,8 @@ def make_context(
and_(PackageRequest.Status == PENDING_ID, PackageRequest.ClosedTS.is_(None))
).count()
context["popularity"] = popularity(pkgbase, time.utcnow())
return context

View file

@ -6,9 +6,10 @@ from fastapi.responses import HTMLResponse
from sqlalchemy import and_, literal, orm
import aurweb.config as config
from aurweb import db, defaults, models
from aurweb import db, defaults, models, time
from aurweb.exceptions import RPCError
from aurweb.filters import number_format
from aurweb.models.package_base import popularity
from aurweb.packages.search import RPCSearch
TYPE_MAPPING = {
@ -120,16 +121,15 @@ class RPC:
if not args:
raise RPCError("No request type/data specified.")
def _get_json_data(self, package: models.Package) -> dict[str, Any]:
def get_json_data(self, package: models.Package) -> dict[str, Any]:
"""Produce dictionary data of one Package that can be JSON-serialized.
:param package: Package instance
:returns: JSON-serializable dictionary
"""
# Produce RPC API compatible Popularity: If zero, it's an integer
# 0, otherwise, it's formatted to the 6th decimal place.
pop = package.Popularity
# Normalize Popularity for RPC output to 6 decimal precision
pop = popularity(package, time.utcnow())
pop = 0 if not pop else float(number_format(pop, 6))
snapshot_uri = config.get("options", "snapshot_uri")
@ -151,8 +151,8 @@ class RPC:
"LastModified": package.ModifiedTS,
}
def _get_info_json_data(self, package: models.Package) -> dict[str, Any]:
data = self._get_json_data(package)
def get_info_json_data(self, package: models.Package) -> dict[str, Any]:
data = self.get_json_data(package)
# All info results have _at least_ an empty list of
# License and Keywords.
@ -176,7 +176,7 @@ class RPC:
"""
return [data_generator(pkg) for pkg in packages]
def _entities(self, query: orm.Query) -> orm.Query:
def entities(self, query: orm.Query) -> orm.Query:
"""Select specific RPC columns on `query`."""
return query.with_entities(
models.Package.ID,
@ -188,38 +188,14 @@ class RPC:
models.PackageBase.Name.label("PackageBaseName"),
models.PackageBase.NumVotes,
models.PackageBase.Popularity,
models.PackageBase.PopularityUpdated,
models.PackageBase.OutOfDateTS,
models.PackageBase.SubmittedTS,
models.PackageBase.ModifiedTS,
models.User.Username.label("Maintainer"),
).group_by(models.Package.ID)
def _handle_multiinfo_type(
self, args: list[str] = [], **kwargs
) -> list[dict[str, Any]]:
self._enforce_args(args)
args = set(args)
packages = (
db.query(models.Package)
.join(models.PackageBase)
.join(
models.User,
models.User.ID == models.PackageBase.MaintainerUID,
isouter=True,
)
.filter(models.Package.Name.in_(args))
)
max_results = config.getint("options", "max_rpc_results")
packages = self._entities(packages).limit(max_results + 1)
if packages.count() > max_results:
raise RPCError("Too many package results.")
ids = {pkg.ID for pkg in packages}
# Aliases for 80-width.
def subquery(self, ids: set[int]):
Package = models.Package
PackageKeyword = models.PackageKeyword
@ -311,7 +287,33 @@ class RPC:
self.extra_info[record.ID][type_].append(name)
return self._assemble_json_data(packages, self._get_info_json_data)
def _handle_multiinfo_type(
    self, args: list[str] = [], **kwargs
) -> list[dict[str, Any]]:
    """
    Handle the info/multiinfo RPC request type.

    :param args: Package names to look up (deduplicated below)
    :param kwargs: Unused extra request arguments
    :returns: List of JSON-serializable package info dictionaries
    :raises RPCError: Via `_enforce_args` for bad args, or directly
        when more than max_rpc_results packages match
    """
    self._enforce_args(args)
    # NOTE(review): mutable default `args=[]` is a Python pitfall, but it
    # is never mutated here (immediately rebound to a set), so harmless.
    args = set(args)

    # Join Package -> PackageBase -> User; the outer join keeps
    # packages with no maintainer in the result set.
    packages = (
        db.query(models.Package)
        .join(models.PackageBase)
        .join(
            models.User,
            models.User.ID == models.PackageBase.MaintainerUID,
            isouter=True,
        )
        .filter(models.Package.Name.in_(args))
    )

    max_results = config.getint("options", "max_rpc_results")
    # Fetch one more row than the limit so overflow can be detected.
    packages = self.entities(packages).limit(max_results + 1)

    if packages.count() > max_results:
        raise RPCError("Too many package results.")

    ids = {pkg.ID for pkg in packages}
    # Populate extra info (e.g. keywords/licenses) for the matched IDs.
    self.subquery(ids)

    return self._assemble_json_data(packages, self.get_info_json_data)
def _handle_search_type(
self, by: str = defaults.RPC_SEARCH_BY, args: list[str] = []
@ -330,12 +332,12 @@ class RPC:
search.search_by(by, arg)
max_results = config.getint("options", "max_rpc_results")
results = self._entities(search.results()).limit(max_results + 1).all()
results = self.entities(search.results()).limit(max_results + 1).all()
if len(results) > max_results:
raise RPCError("Too many package results.")
return self._assemble_json_data(results, self._get_json_data)
return self._assemble_json_data(results, self.get_json_data)
def _handle_msearch_type(
self, args: list[str] = [], **kwargs

View file

@ -155,6 +155,12 @@ PackageBases = Table(
nullable=False,
server_default=text("0"),
),
Column(
"PopularityUpdated",
TIMESTAMP,
nullable=False,
server_default=text("'1970-01-01 00:00:01.000000'"),
),
Column("OutOfDateTS", BIGINT(unsigned=True)),
Column("FlaggerComment", Text, nullable=False),
Column("SubmittedTS", BIGINT(unsigned=True), nullable=False),

View file

@ -0,0 +1,125 @@
import argparse
import importlib
import os
import sys
import traceback
from datetime import datetime
import orjson
import pygit2
from aurweb import config
# Constants
REF = "refs/heads/master"
ORJSON_OPTS = orjson.OPT_SORT_KEYS | orjson.OPT_INDENT_2
def init_repository(git_info) -> None:
    """
    Initialize (if necessary) and configure a Git repository.

    :param git_info: GitInfo instance describing the repository
    """
    # pygit2.init_repository returns the (possibly pre-existing)
    # Repository, so there is no need to re-open it afterwards.
    repo = pygit2.init_repository(git_info.path)
    for k, v in git_info.config.items():
        repo.config[k] = v
def parse_args():
    """Return parsed command-line options; a --spec module name is required."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--spec",
        type=str,
        required=True,
        help="name of spec module in the aurweb.archives.spec package",
    )
    return arg_parser.parse_args()
def update_repository(repo: pygit2.Repository):
    """
    Commit all working-tree changes in `repo`, if any exist.

    :param repo: pygit2.Repository to inspect and commit to
    """
    from datetime import timezone

    # Use git status to determine file changes
    changes = repo.status()
    has_changes = any(
        flags != pygit2.GIT_STATUS_CURRENT for flags in changes.values()
    )

    if not has_changes:
        print("no diff detected")
        return

    print("diff detected, committing")

    # Add everything in the tree.
    print("adding files to git tree")

    # Add the tree to staging
    repo.index.read()
    repo.index.add_all()
    repo.index.write()
    tree = repo.index.write_tree()

    # Determine base commit; if repo.head.target raises GitError,
    # we have no current commits
    try:
        base = [repo.head.target]
    except pygit2.GitError:
        base = []

    # Use an aware UTC datetime: naive datetime.utcnow().timestamp()
    # is interpreted in local time and yields a skewed commit epoch
    # on non-UTC hosts.
    utcnow = datetime.now(timezone.utc)
    author = pygit2.Signature(
        config.get("git-archive", "author"),
        config.get("git-archive", "author-email"),
        int(utcnow.timestamp()),
        0,
    )

    # Commit the changes
    timestamp = utcnow.strftime("%Y-%m-%d %H:%M:%S")
    title = f"update - {timestamp}"
    repo.create_commit(REF, author, author, title, tree, base)
    print("committed changes")
def main() -> int:
    """
    Load and run a single --spec module, write its outputs into their
    target repositories, and commit any resulting changes.

    :returns: 0 on success
    """
    args = parse_args()

    print(f"loading '{args.spec}' spec")
    spec_package = "aurweb.archives.spec"
    module_path = f"{spec_package}.{args.spec}"
    spec_module = importlib.import_module(module_path)
    print(f"loaded '{args.spec}'")

    # Track repositories that the spec modifies. After we run
    # through specs, we want to make a single commit for all
    # repositories that contain changes.
    repos = dict()

    print(f"running '{args.spec}' spec...")
    spec = spec_module.Spec()
    for output in spec.generate():
        # Lazily initialize each target repository on first use.
        if not os.path.exists(output.git_info.path / ".git"):
            init_repository(output.git_info)

        path = output.git_info.path / output.filename
        with open(path, "wb") as f:
            f.write(output.data)

        # Cache one Repository object per target path.
        if output.git_info.path not in repos:
            repos[output.git_info.path] = pygit2.Repository(output.git_info.path)
    print(f"done running '{args.spec}' spec")

    print("processing repositories")
    # Reuse the Repository objects opened above (their keys match
    # spec.repos, both populated via SpecBase.add_output) instead of
    # re-opening each repository from its path.
    for path, repo in repos.items():
        print(f"processing repository: {path}")
        update_repository(repo)

    return 0
if __name__ == "__main__":
try:
sys.exit(main())
except KeyboardInterrupt:
sys.exit(0)
except Exception:
traceback.print_exc()
sys.exit(1)

View file

@ -188,6 +188,7 @@ def _main():
USERS = aurweb.config.get("mkpkglists", "userfile")
bench = Benchmark()
logger.warning(f"{sys.argv[0]} is deprecated and will be soon be removed")
logger.info("Started re-creating archives, wait a while...")
query = (

View file

@ -1,9 +1,10 @@
#!/usr/bin/env python3
from datetime import datetime
from sqlalchemy import and_, func
from sqlalchemy.sql.functions import coalesce, sum as _sum
from aurweb import db, time
from aurweb import config, db, time
from aurweb.models import PackageBase, PackageVote
@ -46,13 +47,24 @@ def run_variable(pkgbases: list[PackageBase] = []) -> None:
ids = set()
if pkgbases:
# If `pkgbases` were given, we should forcefully update the given
# package base records' popularities.
ids = {pkgbase.ID for pkgbase in pkgbases}
query = query.filter(PackageBase.ID.in_(ids))
else:
# Otherwise, we should only update popularities which have exceeded
# the popularity interval length.
interval = config.getint("git-archive", "popularity-interval")
query = query.filter(
PackageBase.PopularityUpdated
<= datetime.fromtimestamp((now - interval))
)
query.update(
{
"NumVotes": votes_subq.scalar_subquery(),
"Popularity": pop_subq.scalar_subquery(),
"PopularityUpdated": datetime.fromtimestamp(now),
}
)

View file

@ -1,6 +1,4 @@
import os
import shlex
from subprocess import PIPE, Popen
from typing import Tuple
import py
@ -8,6 +6,7 @@ import py
from aurweb.models import Package
from aurweb.templates import base_template
from aurweb.testing.filelock import FileLock
from aurweb.util import shell_exec
class GitRepository:
@ -24,10 +23,7 @@ class GitRepository:
self.file_lock.lock(on_create=self._setup)
def _exec(self, cmdline: str, cwd: str) -> Tuple[int, str, str]:
args = shlex.split(cmdline)
proc = Popen(args, cwd=cwd, stdout=PIPE, stderr=PIPE)
out, err = proc.communicate()
return (proc.returncode, out.decode().strip(), err.decode().strip())
return shell_exec(cmdline, cwd)
def _exec_repository(self, cmdline: str) -> Tuple[int, str, str]:
    """
    Execute `cmdline` inside this repository's working directory.

    :param cmdline: Shell-style command line to run
    :returns: (returncode, stripped stdout, stripped stderr) tuple
    """
    return self._exec(cmdline, cwd=str(self.file_lock.path))

View file

@ -1,6 +1,7 @@
import math
import re
import secrets
import shlex
import string
from datetime import datetime
from http import HTTPStatus
@ -192,3 +193,10 @@ def parse_ssh_key(string: str) -> Tuple[str, str]:
def parse_ssh_keys(string: str) -> list[Tuple[str, str]]:
"""Parse a list of SSH public keys."""
return [parse_ssh_key(e) for e in string.splitlines()]
def shell_exec(cmdline: str, cwd: str) -> Tuple[int, str, str]:
    """
    Execute a command line in a subprocess and capture its output.

    :param cmdline: Shell-style command line (split with shlex, no shell)
    :param cwd: Working directory for the subprocess
    :returns: (returncode, stripped stdout, stripped stderr) tuple
    """
    args = shlex.split(cmdline)
    # Context manager guarantees the process's pipes are closed even if
    # communicate() raises.
    with Popen(args, cwd=cwd, stdout=PIPE, stderr=PIPE) as proc:
        out, err = proc.communicate()
    return (proc.returncode, out.decode().strip(), err.decode().strip())