# msys2-web/app/fetch/utils.py
# Copyright 2016-2020 Christoph Reiter
# SPDX-License-Identifier: MIT

import asyncio
import datetime
import hashlib
import os
from email.utils import parsedate_to_datetime
from typing import Any, Optional
from urllib.parse import quote_plus, urlparse

import httpx

from .. import appconfig
from ..appconfig import REQUEST_TIMEOUT
from ..utils import logger


def get_mtime_for_response(response: httpx.Response) -> datetime.datetime | None:
    """Parse the response's last-modified header into a UTC datetime, or None
    if the header is missing."""

    last_modified = response.headers.get("last-modified")
    if last_modified is not None:
        dt: datetime.datetime = parsedate_to_datetime(last_modified)
        return dt.astimezone(datetime.timezone.utc)
    return None
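
# For illustration (a hypothetical header value, not from any real response):
# a response carrying "Last-Modified: Wed, 21 Oct 2015 07:28:00 GMT" comes back
# as datetime.datetime(2015, 10, 21, 7, 28, tzinfo=datetime.timezone.utc).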


async def get_content_cached_mtime(url: str, *args: Any, **kwargs: Any) -> tuple[bytes, datetime.datetime | None]:
    """Returns the content of the URL response, and a datetime object for when
    the content was last modified.
    """

    # Cache the file locally, and store the "last-modified" date as the file mtime
    cache_dir = appconfig.CACHE_DIR
    if cache_dir is None:
        async with httpx.AsyncClient(follow_redirects=True) as client:
            r = await client.get(url, *args, **kwargs)
            r.raise_for_status()
            return (r.content, get_mtime_for_response(r))

    os.makedirs(cache_dir, exist_ok=True)
    # Cache file name: the hostname plus a short hash of the full URL, so it
    # stays human readable while still being unique per URL
    cache_fn = quote_plus(
        (urlparse(url).hostname or "") +
        "." + hashlib.sha256(url.encode()).hexdigest()[:16] +
        ".cache")
    fn = os.path.join(cache_dir, cache_fn)
    if not os.path.exists(fn):
        async with httpx.AsyncClient(follow_redirects=True) as client:
            r = await client.get(url, *args, **kwargs)
            r.raise_for_status()
            with open(fn, "wb") as h:
                h.write(r.content)
            mtime = get_mtime_for_response(r)
            if mtime is not None:
                os.utime(fn, (mtime.timestamp(), mtime.timestamp()))
    with open(fn, "rb") as h:
        data = h.read()
    file_mtime = datetime.datetime.fromtimestamp(os.path.getmtime(fn), datetime.timezone.utc)
    return (data, file_mtime)
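
# A minimal usage sketch (the URL is a placeholder, not one the app fetches):
#
#   data, mtime = await get_content_cached_mtime(
#       "https://example.com/packages.json", timeout=REQUEST_TIMEOUT)
#   if mtime is not None:
#       print(f"{len(data)} bytes, last modified {mtime.isoformat()}")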


async def get_content_cached(url: str, *args: Any, **kwargs: Any) -> bytes:
    return (await get_content_cached_mtime(url, *args, **kwargs))[0]


CacheHeaders = dict[str, Optional[str]]


async def check_needs_update(urls: list[str], _cache: dict[str, CacheHeaders] = {}) -> bool:
    """Returns True if any of the URLs have changed since the last call.

    Raises RequestException.
    """

    # With a local cache directory configured there is no point in asking the
    # server; always report a change and let the cache layer sort it out
    if appconfig.CACHE_DIR:
        return True

    async def get_cache_headers(client: httpx.AsyncClient, url: str, timeout: float) -> tuple[str, CacheHeaders]:
        """This tries to return the cache response headers for a given URL as
        cheaply as possible: a HEAD request, made conditional with
        if-modified-since/if-none-match whenever previous headers are known.
        """

        old_headers = _cache.get(url, {})
        last_modified = old_headers.get("last-modified")
        etag = old_headers.get("etag")

        fetch_headers = {}
        if last_modified is not None:
            fetch_headers["if-modified-since"] = last_modified
        if etag is not None:
            fetch_headers["if-none-match"] = etag

        r = await client.head(url, timeout=timeout, headers=fetch_headers)
        if r.status_code == 304:
            # Not modified: the previously cached headers are still valid
            return (url, dict(old_headers))
        r.raise_for_status()

        new_headers = {}
        new_headers["last-modified"] = r.headers.get("last-modified")
        new_headers["etag"] = r.headers.get("etag")
        return (url, new_headers)
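
    # For example (a hypothetical exchange, values invented for illustration):
    # the first HEAD for a URL answers 200 with 'etag: "abc123"'; the next
    # call sends 'if-none-match: "abc123"', the server answers 304, the old
    # headers are kept, and the URL does not count as changed.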

    needs_update = False
    async with httpx.AsyncClient(follow_redirects=True) as client:
        # Check all URLs concurrently; gather() preserves the input order
        awaitables = []
        for url in urls:
            awaitables.append(get_cache_headers(client, url, timeout=REQUEST_TIMEOUT))
        for url, new_cache_headers in (await asyncio.gather(*awaitables)):
            old_cache_headers = _cache.get(url, {})
            if old_cache_headers != new_cache_headers:
                needs_update = True
            _cache[url] = new_cache_headers

    logger.info(f"check needs update: {urls!r} -> {needs_update!r}")
    return needs_update
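
# A minimal sketch of how these helpers combine in a poll loop; the URL is a
# placeholder, not one the app actually polls:
#
#   async def refresh() -> None:
#       urls = ["https://example.com/repo.db"]
#       if await check_needs_update(urls):
#           data = await get_content_cached(urls[0], timeout=REQUEST_TIMEOUT)
#           ...  # parse the payload and update application state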