checking in work

This commit is contained in:
brent s. 2020-06-14 03:46:29 -04:00
parent 2ba79cd801
commit eed480c590
Signed by: bts
GPG Key ID: 8C004C2F93481F6B
8 changed files with 267 additions and 65 deletions

View File

@ -4,15 +4,8 @@ import logging
##
from . import config
from . import constants
from . import sync


_logger = logging.getLogger()


class Sync(object):
    """Top-level sync driver: loads the configuration and holds sync state."""

    def __init__(self, cfg = None, dummy = False, distro = None, logdir = None, *args, **kwargs):
        # Snapshot the call arguments (minus self) purely for debug logging.
        call_args = dict(locals())
        call_args.pop('self')
        _logger.debug('Sync class instantiated with args: {0}'.format(call_args))
        self.cfg = config.Config(cfg)


61
repomirror/sync.py Normal file
View File

@ -0,0 +1,61 @@
import datetime
import logging
import os
##
from . import config


_logger = logging.getLogger()


class Args(object):
    """Collects the option elements (<short>/<long>) defined under an arg node.

    NOTE(review): this class is work-in-progress in the original commit — the
    original ``_parse_xml`` loop had an empty body, which is a SyntaxError.
    A minimal body is supplied so the module can at least import.
    """

    def __init__(self, args_xml):
        # The source element this Args was built from.
        self.xml = args_xml
        # Accumulated option elements, filled by _parse_xml().
        self.args = []
        self._parse_xml()

    def _parse_xml(self):
        # Gather the <short>/<long> option elements in document order.
        # TODO(review): replace with real per-element parsing once the schema settles.
        for arg_xml in self.xml.xpath('(short|long)'):
            self.args.append(arg_xml)



class Mount(object):
    """A local filesystem path plus whether it is currently a mountpoint."""

    def __init__(self, mpchk_xml):
        # NOTE(review): presumably mpchk_xml is the element's text content —
        # confirm callers pass a string, not an element object.
        self.path = os.path.abspath(os.path.expanduser(mpchk_xml))
        self.is_mounted = None
        self._check_mount()

    def _check_mount(self):
        # Scan the kernel's mount table for an exact mountpoint match.
        with open('/proc/mounts', 'r') as fh:
            mount_table = fh.read()
        for entry in mount_table.splitlines():
            mountpoint = entry.split()[1]
            if mountpoint == self.path:
                self.is_mounted = True
                return(None)
        self.is_mounted = False
        return(None)


class TimestampFile(object):
    """A timestamp file: a strftime format plus the file's absolute path.

    Maps the two special format names onto their strftime equivalents;
    any other value is taken as a literal strftime format string.
    """

    def __init__(self, ts_xml):
        fmt = ts_xml.attrib.get('timeFormat', 'UNIX_EPOCH')
        if fmt == 'UNIX_EPOCH':
            fmt = '%s'
        elif fmt == 'MICROSECOND_EPOCH':
            fmt = '%s.%f'
        self.fmt = fmt
        self.path = os.path.abspath(os.path.expanduser(ts_xml.text))


class Upstream(object):
    """Placeholder for a configured upstream mirror (not yet implemented)."""

    def __init__(self, upstream_xml):
        # TODO: parse upstream_xml once the element schema is finalized.
        pass


class Sync(object):
    """Mirror-sync entry point; parses the given config and holds sync state."""

    def __init__(self, cfg = None, dummy = False, distro = None, logdir = None, *args, **kwargs):
        # Log exactly how we were invoked (all parameters except self).
        invocation = {k: v for k, v in locals().items() if k != 'self'}
        _logger.debug('Sync class instantiated with args: {0}'.format(invocation))
        self.cfg = config.Config(cfg)

View File

@ -28,8 +28,8 @@
* https://strftime.org/
The default is to use a regular UNIX Epoch integer (e.g. June 13, 2020 5:03:53 PM UTC => 1592067833).
This can be manually specified by the special string "UNIX_EPOCH".
Optionally, you can use the special string "MICROSECOND_EPOCH", which will specify the above with microseconds.
e.g. June 13, 2020 5:09:13.995777 PM UTC => 1592068153.995777
Optionally, you can use the special string "MICROSECOND_EPOCH", which will specify the above with left-padded
microseconds (e.g. June 13, 2020 5:09:13.995777 PM UTC => 1592068153.995777).
-->
<lastLocalCheck timeFormat="MICROSECOND_EPOCH">/srv/http/arch.lastcheck</lastLocalCheck>
<!--
@ -84,6 +84,10 @@
<!--
The following example uses "rsync://arch.mirror.constant.com/archlinux/"
(https://www.archlinux.org/mirrors/constant.com/1008/)
If you need to find a mirror, you may be interested in the utils/find_fastest_upstream/ scripts. They will
automatically find (and sort based on connection speed) all mirrors in your country for a given distro.
They can even generate stubbed configuration files using those upstreams.
Currently only Arch Linux and CentOS are supported.
-->
<!--
Required; one of:

View File

@ -1,57 +1,56 @@
#!/usr/bin/env python3

import argparse
import datetime
import re
##
import iso3166
##
import classes


# Pulls the leading numeric portion out of a status-table cell (e.g. "98.7%" -> "98.7").
# NOTE(review): only used by the legacy HTML-scraping parser below.
_strip_re = re.compile(r'^\s*(?P<num>[0-9.]+).*$')


class Ranker(classes.Ranker):
    """Ranks Arch Linux tier-1 mirrors using the JSON mirror-status feed."""
    mirrorlist_url = 'https://www.archlinux.org/mirrors/status/tier/1/json/'
    distro_name = 'archlinux'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.get_mirrors()
        # Full country name for this host, resolved from its ISO-3166-1 alpha-2 code.
        self.mycountry = iso3166.countries_by_alpha2[self.my_info['country']].name

    def extract_mirrors(self):
        """Populate raw_mirrors/mirror_candidates from the JSON status feed.

        NOTE(review): the original commit left the half-deleted legacy
        HTML-scraping implementation (not valid Python) interleaved with this
        JSON version, plus a duplicate mirrorlist_url assignment; only the
        JSON path is kept here.
        """
        for mirror in self.req.json()['urls']:
            if not all((mirror['active'],  # Only successful/active mirrors
                        mirror['isos'],  # Only mirrors with ISOs
                        # Only mirrors that support rsync (Arch mirrors do not support ftp)
                        (mirror['protocol'] == 'rsync'),
                        # Only mirrors in the system's country (May be buggy if both are not ISO-3166-1 Alpha-2)
                        (mirror['country_code'].upper() == self.my_info['country'].upper()),
                        # Only mirrors that are at least 100% complete.
                        (mirror['completion_pct'] >= 1.0))):
                continue
            # Convert the timestamp to python-native.
            mirror['last_sync'] = datetime.datetime.strptime(mirror['last_sync'], '%Y-%m-%dT%H:%M:%SZ')
            self.raw_mirrors.append(mirror)
            self.mirror_candidates.append(mirror['url'])
        return(None)


def parseArgs():
    # Build the CLI parser; -x/--xml switches output from a plain URL list
    # to an XML config stub.
    parser = argparse.ArgumentParser(description = 'Generate a list of suitable Arch Linux upstream mirrors in order of '
                                                   'speed')
    parser.add_argument('-x', '--xml',
                        dest = 'xml',
                        action = 'store_true',
                        help = ('If specified, generate a config stub instead of a printed list of URLs'))
    return(parser)


def main():
    """Entry point: build the ranker, time the candidates, and emit results.

    Prints either a plain ranked URL list or (with -x/--xml) an XML config stub.
    """
    args = parseArgs().parse_args()
    r = Ranker()
    r.extract_mirrors()
    # (Removed leftover debug dump of r.raw_mirrors via an inline pprint import;
    # this matches the CentOS variant of this script.)
    r.speedcheck()
    if args.xml:
        print(r.gen_xml())
    else:
        r.print()
    return(None)



View File

@ -0,0 +1,79 @@
#!/usr/bin/env python3

import argparse
import csv
import datetime
import io
import re
##
import classes


# Captures the scheme (http/https) so it can be swapped for rsync/ftp in mirror URLs.
_proto_re = re.compile(r'^(?P<proto>https?)(?P<uri>.*)')


class Ranker(classes.Ranker):
    """Ranks CentOS mirrors from the project's full-mirrorlist CSV."""
    # https://lists.centos.org/pipermail/centos-mirror/2017-March/010312.html
    mirrorlist_url = 'https://www.centos.org/download/full-mirrorlist.csv'
    distro_name = 'centos'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.get_mirrors()

    def extract_mirrors(self, preferred_proto = 'rsync'):
        """Pick candidate mirror URLs in this host's country, preferring preferred_proto."""
        preferred_proto = preferred_proto.lower()
        if preferred_proto not in ('rsync', 'ftp'):
            raise ValueError('Invalid preferred_proto; must be one of rsync or ftp')
        fallback_proto = 'rsync' if preferred_proto == 'ftp' else 'ftp'
        reader = csv.DictReader(io.StringIO(self.raw_html))
        for row in reader:
            if not row['Country'] or row['Country'].strip() == '':
                continue
            # GorRAM it, dudes. States are not countries.
            country = row['Country'].strip()
            region = row['Region'].strip()
            if region == 'US':
                country = region
            if country != self.my_info['country']:
                continue
            # Normalize blank cells to None.
            for col, val in row.items():
                if val.strip() == '':
                    row[col] = None
            pref_url = row['{0} mirror link'.format(preferred_proto)]
            nonpref_url = row['{0} mirror link'.format(fallback_proto)]
            # Rewrite the http(s) scheme to the sync protocol we settled on.
            if pref_url:
                url = _proto_re.sub(r'{0}\g<uri>'.format(preferred_proto), pref_url)
            elif nonpref_url:
                url = _proto_re.sub(r'{0}\g<uri>'.format(fallback_proto), nonpref_url)
            else:
                continue
            self.raw_mirrors.append(row)
            self.mirror_candidates.append(url)
        return(None)


def parseArgs():
    # CLI parser: -x/--xml emits an XML config stub instead of a URL list.
    parser = argparse.ArgumentParser(description = 'Generate a list of suitable CentOS upstream mirrors in order of '
                                                   'speed')
    parser.add_argument('-x', '--xml',
                        dest = 'xml',
                        action = 'store_true',
                        help = ('If specified, generate a config stub instead of a printed list of URLs'))
    return(parser)


def main():
    """CLI entry point: rank CentOS mirrors and print a list (or XML stub)."""
    opts = parseArgs().parse_args()
    ranker = Ranker()
    ranker.extract_mirrors()
    ranker.speedcheck()
    if opts.xml:
        print(ranker.gen_xml())
    else:
        ranker.print()
    return(None)


if __name__ == '__main__':
    main()

View File

@ -1,22 +1,31 @@
import socket
import time
from urllib import parse as urlparse
##
import requests
from bs4 import BeautifulSoup
from lxml import etree
##
import constants


class Ranker(object):
mirrorlist_url = None # This is replaced by subclasses
distro_name = None

def __init__(self, parser = 'lxml', *args, **kwargs):
    """Initialize shared ranker state and fetch this host's info.

    parser: BeautifulSoup parser backend name (default 'lxml').
    """
    # Info about this host (at least a 'country' key is read by subclasses),
    # populated by get_myinfo() below.
    self.my_info = {}
    self.raw_html = None
    self.parser = parser
    self.bs = None
    self.req = None
    self.get_myinfo()
    # The native collection of mirror information.
    self.raw_mirrors = []
    # The list of URLs only of the above.
    self.mirror_candidates = []
    # Keyed on connect time in seconds -> mirror dict / URL (filled by speedcheck()).
    self.ranked_mirrors = {}
    self.ranked_urls = {}

def extract_mirrors(self):
# A dummy func. This should be overridden by subclasses.
@ -34,6 +43,79 @@ class Ranker(object):
req = requests.get(self.mirrorlist_url)
if not req.ok:
raise RuntimeError('Could not contact information gatherer')
self.raw_html = req.content.decode('utf-8')
self.req = req
self.raw_html = self.req.content.decode('utf-8')
self.bs = BeautifulSoup(self.raw_html, self.parser)
return(None)

def speedcheck(self):
    """Rank candidate mirrors by TCP connect time.

    Connects to each candidate URL's host/port with a 7-second timeout and
    records the connect duration; unreachable mirrors are skipped. Populates
    ranked_mirrors and ranked_urls, both keyed on connect time in seconds.
    """
    if not self.mirror_candidates:
        self.extract_mirrors()
    for url in self.mirror_candidates:
        u = urlparse.urlparse(url)
        port = u.port
        if not port:
            port = constants.DEF_PORTS[u.scheme.lower()]
        sock = socket.socket()
        sock.settimeout(7)
        try:
            start = time.perf_counter()
            sock.connect((u.hostname, port))
            conntime = time.perf_counter() - start  # in seconds
        except (socket.timeout, socket.error):
            continue
        finally:
            # Always release the socket — the original leaked it when
            # connect() timed out or errored.
            sock.close()
        # Skip the mirror if it has an exact time in the mirrors already.
        # Sure, it's *very* unlikely, but best practice to do this.
        if conntime in self.ranked_mirrors:
            continue
        mirror = {}
        for a in ('path', 'port'):
            mirror[a] = getattr(u, a, None)
        mirror['domain'] = u.hostname.lower()
        mirror['syncType'] = u.scheme.lower()
        if not mirror['port']:
            mirror['port'] = constants.DEF_PORTS[mirror['syncType']]
        if mirror['path'] == '':
            mirror['path'] = '/'
        self.ranked_mirrors[conntime] = mirror
        self.ranked_urls[conntime] = url
    return(None)

def print(self):
    """Print ranked mirror URLs to stdout, fastest first."""
    if not self.ranked_mirrors:
        self.speedcheck()
    print('Mirrors in order of speed:\n')
    for conntime in sorted(self.ranked_urls):
        print('{0} # ({1} seconds to connect)'.format(self.ranked_urls[conntime], conntime))
    return(None)

def gen_xml(self):
    """Serialize the ranked mirrors as a repomirror config stub.

    Returns a pretty-printed XML string containing one <upstream> element per
    mirror under a <distro> element, ordered fastest-first, each preceded by a
    comment noting its connect time. Raises ValueError when called on the base
    class (distro_name unset).
    """
    if not self.distro_name:
        raise ValueError('This class must be subclassed to be useful')
    if not self.ranked_mirrors:
        self.speedcheck()
    # Skeleton document carrying the repomirror namespace and schema location.
    s = ('<?xml version="1.0" encoding="UTF-8" ?>'
         '<mirror xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" '
         'xmlns="https://git.square-r00t.net/RepoMirror/" '
         'xsi:schemaLocation="https://git.square-r00t.net/RepoMirror/ '
         'http://schema.xml.r00t2.io/projects/repomirror.xsd">'
         '</mirror>')
    xml = etree.fromstring(s.encode('utf-8'))
    distro = etree.Element('distro')
    distro.attrib['name'] = self.distro_name
    # Keys of ranked_mirrors are connect times in seconds; iterate fastest-first.
    for m in sorted(list(self.ranked_mirrors.keys())):
        mirror = self.ranked_mirrors[m]
        distro.append(etree.Comment(' ({0} seconds to connect) '.format(m)))
        u = etree.SubElement(distro, 'upstream')
        # One child element per mirror attribute (e.g. domain, port, path, syncType).
        for k, v in mirror.items():
            e = etree.SubElement(u, k)
            e.text = str(v)
    xml.append(distro)
    return(etree.tostring(xml,
                          pretty_print = True,
                          with_comments = True,
                          with_tail = True,
                          encoding = 'UTF-8',
                          xml_declaration = True).decode('utf-8'))

View File

@ -1 +1,5 @@
# Service that reports this host's public-IP info as JSON (country, etc.).
MYINFO_URL = 'https://ipinfo.io'
# Default ports per sync scheme, used when a mirror URL omits an explicit port.
DEF_PORTS = {'ftp': 21,
             'http': 80,
             'https': 443,
             'rsync': 873}

View File

@ -1,20 +0,0 @@
#!/usr/bin/env python3
# Prototype/scratch script (deleted in this commit): scrapes the Arch Linux
# tier-1 mirror status page and dumps each mirror row as a dict.

import requests
from bs4 import BeautifulSoup

# NOTE(review): 'country' is never used below.
country = 'US'
url = 'https://www.archlinux.org/mirrors/status/tier/1/'

req = requests.get(url)
html = req.content.decode('utf-8')
bs = BeautifulSoup(html, 'lxml')

# Only mirrors that synced successfully appear in this table.
mirrors = bs.find('table', {'id': 'successful_mirrors'})
header = mirrors.find('thead').find('tr')
# The first header cell is blank on the page; name it 'details'.
headers = [h.text if h.text != '' else 'details' for h in header.find_all('th')]

# One dict per table row, keyed by the column headers.
results = [{headers[i]: cell.text for i, cell in enumerate(row.find_all('td'))} for row in mirrors.find_all('tr')]

import pprint
pprint.pprint(results)