checking in work

parent 2ba79cd801
commit eed480c590
@@ -4,15 +4,8 @@ import logging
##
from . import config
from . import constants
from . import sync


_logger = logging.getLogger()


class Sync(object):
    def __init__(self, cfg = None, dummy = False, distro = None, logdir = None, *args, **kwargs):
        _args = dict(locals())
        del(_args['self'])
        _logger.debug('Sync class instantiated with args: {0}'.format(_args))
        self.cfg = config.Config(cfg)
repomirror/sync.py  (new file, 61 lines)
@@ -0,0 +1,61 @@
import datetime
import logging
import os
##
from . import config


_logger = logging.getLogger()


class Args(object):
    def __init__(self, args_xml):
        self.xml = args_xml
        self.args = []
        self._parse_xml()

    def _parse_xml(self):
        for arg_xml in self.xml.xpath('(short|long)'):
            # Work in progress; the loop body is not filled in yet in this commit.
            pass


class Mount(object):
    def __init__(self, mpchk_xml):
        self.path = os.path.abspath(os.path.expanduser(mpchk_xml))
        self.is_mounted = None
        self._check_mount()

    def _check_mount(self):
        with open('/proc/mounts', 'r') as fh:
            raw = fh.read()
        for line in raw.splitlines():
            l = line.split()
            # The second whitespace-separated field of a /proc/mounts line is the mountpoint.
            mp = l[1]
            if mp == self.path:
                self.is_mounted = True
                return(None)
        self.is_mounted = False
        return(None)


class TimestampFile(object):
    def __init__(self, ts_xml):
        self.fmt = ts_xml.attrib.get('timeFormat', 'UNIX_EPOCH')
        if self.fmt == 'UNIX_EPOCH':
            self.fmt = '%s'
        elif self.fmt == 'MICROSECOND_EPOCH':
            self.fmt = '%s.%f'
        self.path = os.path.abspath(os.path.expanduser(ts_xml.text))


class Upstream(object):
    def __init__(self, upstream_xml):
        pass


class Sync(object):
    def __init__(self, cfg = None, dummy = False, distro = None, logdir = None, *args, **kwargs):
        _args = dict(locals())
        del(_args['self'])
        _logger.debug('Sync class instantiated with args: {0}'.format(_args))
        self.cfg = config.Config(cfg)
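A hypothetical usage sketch (not part of this commit) for the new helper classes above. It assumes the repomirror package and its config module are importable and that this runs on a Linux host with /proc/mounts; the mountpoint and file path are made up for illustration.

import lxml.etree as etree

from repomirror import sync

# TimestampFile picks its strftime format from the timeFormat attribute.
ts_xml = etree.fromstring('<lastLocalCheck timeFormat="MICROSECOND_EPOCH">~/arch.lastcheck</lastLocalCheck>')
ts = sync.TimestampFile(ts_xml)
print(ts.fmt)   # '%s.%f'
print(ts.path)  # absolute, user-expanded path to arch.lastcheck

# Mount resolves the path and scans /proc/mounts for it (Linux only).
mnt = sync.Mount('/mnt/mirror')  # hypothetical mountpoint
print(mnt.is_mounted)            # True only if /mnt/mirror is currently mounted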
@@ -28,8 +28,8 @@
            * https://strftime.org/
        The default is to use a regular UNIX Epoch integer (e.g. June 13, 2020 5:03:53 PM UTC => 1592067833).
        This can be manually specified by the special string "UNIX_EPOCH".
        Optionally, you can use the special string "MICROSECOND_EPOCH", which will specify the above with microseconds.
        e.g. June 13, 2020 5:09:13.995777 PM UTC => 1592068153.995777
        Optionally, you can use the special string "MICROSECOND_EPOCH", which will specify the above with left-padded
        microseconds (e.g. June 13, 2020 5:09:13.995777 PM UTC => 1592068153.995777).
    -->
    <lastLocalCheck timeFormat="MICROSECOND_EPOCH">/srv/http/arch.lastcheck</lastLocalCheck>
    <!--
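A minimal illustration (not part of the commit) of what the two timeFormat values documented in the comment above evaluate to for the example dates it gives, using timezone-aware datetimes:

import datetime

utc = datetime.timezone.utc
epoch_example = datetime.datetime(2020, 6, 13, 17, 3, 53, tzinfo = utc)
micro_example = datetime.datetime(2020, 6, 13, 17, 9, 13, 995777, tzinfo = utc)

# "UNIX_EPOCH" (strftime '%s'): whole seconds since the epoch.
print(int(epoch_example.timestamp()))  # 1592067833
# "MICROSECOND_EPOCH" (strftime '%s.%f'): the same, with left-padded microseconds appended.
print('{0}.{1:06d}'.format(int(micro_example.timestamp()), micro_example.microsecond))  # 1592068153.995777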
@@ -84,6 +84,10 @@
    <!--
        The following example uses "rsync://arch.mirror.constant.com/archlinux/"
        (https://www.archlinux.org/mirrors/constant.com/1008/)
        If you need to find a mirror, you may be interested in the utils/find_fastest_upstream/ scripts. They will
        automatically find (and sort based on connection speed) all mirrors in your country for a given distro.
        They can even generate stubbed configuration files using those upstreams.
        Currently only Arch Linux and CentOS are supported.
    -->
    <!--
        Required; one of:
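The comment above points at the utils/find_fastest_upstream/ helpers. A rough sketch (not in the commit) of what they do when asked for a config stub; it hypothetically assumes the Arch Linux helper is importable as arch from inside that directory, since the actual filename is not shown in this diff:

import arch  # hypothetical module name for the Arch Linux helper shown later in this commit

r = arch.Ranker()     # looks up the local country and fetches the mirror status list
r.extract_mirrors()   # keeps only active, complete, rsync-capable mirrors in that country
r.speedcheck()        # ranks the candidates by TCP connect time
print(r.gen_xml())    # prints <distro>/<upstream> stubs suitable for the config file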
@@ -1,57 +1,56 @@
#!/usr/bin/env python3

import argparse
import datetime
import re
##
import iso3166
##
import classes


_strip_re = re.compile(r'^\s*(?P<num>[0-9.]+).*$')


class Ranker(classes.Ranker):
    mirrorlist_url = 'https://www.archlinux.org/mirrors/status/tier/1/'
    mirrorlist_url = 'https://www.archlinux.org/mirrors/status/tier/1/json/'
    distro_name = 'archlinux'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.get_mirrors()
        self.mycountry = iso3166.countries_by_alpha2[self.my_info['country']].name

    def extract_mirrors(self):
        # Limit to only successful mirrors.
        mirrors = self.bs.find('table', {'id': 'successful_mirrors'})
        # Ayyy, thanks dude.
        # Modified from https://stackoverflow.com/a/56835562/733214.
        header = mirrors.find('thead').find('tr')
        headers = [h.text if h.text != '' else 'details' for h in header.find_all('th')]
        raw_rows = mirrors.find_all('tr')
        # rows = [{headers[i]: cell.text for i, cell in enumerate(row.find_all('td'))} for row in raw_rows]
        rows = [{headers[i]: cell for i, cell in enumerate(row.find_all('td'))} for row in raw_rows]
        for r in rows:
            for k, v in r.items():
                print(v)
                if k in ('Completion %', 'Mirror Score', 'μ Duration (s)', 'σ Duration (s)'):
                    r[k] = float(_strip_re.sub(r'\g<num>', v.text).strip())
                elif k == 'μ Delay (hh:mm)':
                    # HOO boy. Wish they just did it in seconds.
                    pass
                # elif k == 'Country':
            self.raw_mirrors.append(r)
        # for row in rows:
        #     if not row:
        #         continue
        #     for k in ('Completion %', 'Mirror Score', 'μ Duration (s)', 'σ Duration (s)'):
        #         row[k] = float(_strip_re.sub(r'\g<num>', row[k]).strip())

        for mirror in self.req.json()['urls']:
            if not all((mirror['active'],  # Only successful/active mirrors
                        mirror['isos'],  # Only mirrors with ISOs
                        # Only mirrors that support rsync (Arch mirrors do not support ftp)
                        (mirror['protocol'] == 'rsync'),
                        # Only mirrors in the system's country (May be buggy if both are not ISO-3166-1 Alpha-2)
                        (mirror['country_code'].upper() == self.my_info['country'].upper()),
                        # Only mirrors that are at least 100% complete.
                        (mirror['completion_pct'] >= 1.0))):
                continue
            # Convert the timestamp to python-native.
            mirror['last_sync'] = datetime.datetime.strptime(mirror['last_sync'], '%Y-%m-%dT%H:%M:%SZ')
            self.raw_mirrors.append(mirror)
            self.mirror_candidates.append(mirror['url'])
        return(None)


def parseArgs():
    args = argparse.ArgumentParser(description = 'Generate a list of suitable Arch Linux upstream mirrors in order of '
                                                 'speed')
    args.add_argument('-x', '--xml',
                      dest = 'xml',
                      action = 'store_true',
                      help = ('If specified, generate a config stub instead of a printed list of URLs'))
    return(args)


def main():
    args = parseArgs().parse_args()
    r = Ranker()
    r.extract_mirrors()
    import pprint
    pprint.pprint(r.raw_mirrors)
    r.speedcheck()
    if args.xml:
        print(r.gen_xml())
    else:
        r.print()
    return(None)
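A hypothetical worked example (not part of the commit) of the candidate filter the JSON-based extract_mirrors() above applies, run against a hand-written record using the same keys the script reads from the Arch mirror-status JSON. The field values here are made up for illustration, except the URL, which is the example mirror from the config comments.

sample = {'active': True,
          'isos': True,
          'protocol': 'rsync',
          'country_code': 'us',
          'completion_pct': 1.0,
          'url': 'rsync://arch.mirror.constant.com/archlinux/',
          'last_sync': '2020-06-13T17:09:13Z'}
my_country = 'US'  # stand-in for self.my_info['country']
keep = all((sample['active'],
            sample['isos'],
            (sample['protocol'] == 'rsync'),
            (sample['country_code'].upper() == my_country.upper()),
            (sample['completion_pct'] >= 1.0)))
print(keep)  # True, so this record would be appended to raw_mirrors and mirror_candidates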
@@ -0,0 +1,79 @@
#!/usr/bin/env python3

import argparse
import csv
import datetime
import io
import re
##
import classes


_proto_re = re.compile(r'^(?P<proto>https?)(?P<uri>.*)')


class Ranker(classes.Ranker):
    # https://lists.centos.org/pipermail/centos-mirror/2017-March/010312.html
    mirrorlist_url = 'https://www.centos.org/download/full-mirrorlist.csv'
    distro_name = 'centos'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.get_mirrors()

    def extract_mirrors(self, preferred_proto = 'rsync'):
        preferred_proto = preferred_proto.lower()
        if preferred_proto not in ('rsync', 'ftp'):
            raise ValueError('Invalid preferred_proto; must be one of rsync or ftp')
        non_preferred = ('rsync' if preferred_proto == 'ftp' else 'ftp')
        c = csv.DictReader(io.StringIO(self.raw_html))
        for row in c:
            if not row['Country'] or row['Country'].strip() == '':
                continue
            # GorRAM it, dudes. States are not countries.
            country = row['Country'].strip()
            region = row['Region'].strip()
            if region == 'US':
                country = region
            if country != self.my_info['country']:
                continue
            for k, v in row.items():
                if v.strip() == '':
                    row[k] = None
            pref_url = row['{0} mirror link'.format(preferred_proto)]
            nonpref_url = row['{0} mirror link'.format(non_preferred)]
            if pref_url:
                url = _proto_re.sub(r'{0}\g<uri>'.format(preferred_proto), pref_url)
            else:
                if not nonpref_url:
                    continue
                url = _proto_re.sub(r'{0}\g<uri>'.format(non_preferred), nonpref_url)
            self.raw_mirrors.append(row)
            self.mirror_candidates.append(url)
        return(None)


def parseArgs():
    args = argparse.ArgumentParser(description = 'Generate a list of suitable CentOS upstream mirrors in order of '
                                                 'speed')
    args.add_argument('-x', '--xml',
                      dest = 'xml',
                      action = 'store_true',
                      help = ('If specified, generate a config stub instead of a printed list of URLs'))
    return(args)


def main():
    args = parseArgs().parse_args()
    r = Ranker()
    r.extract_mirrors()
    r.speedcheck()
    if args.xml:
        print(r.gen_xml())
    else:
        r.print()
    return(None)


if __name__ == '__main__':
    main()
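A small illustration (not part of the commit) of how the _proto_re substitution above rewrites a CSV "mirror link" URL's scheme to the preferred protocol, as extract_mirrors() does; the URL is a made-up example.

import re

_proto_re = re.compile(r'^(?P<proto>https?)(?P<uri>.*)')
pref_url = 'http://mirror.example.com/centos/'  # hypothetical CSV value
print(_proto_re.sub(r'{0}\g<uri>'.format('rsync'), pref_url))
# -> rsync://mirror.example.com/centos/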
@@ -1,22 +1,31 @@
import socket
import time
from urllib import parse as urlparse
##
import requests
from bs4 import BeautifulSoup
from lxml import etree
##
import constants


class Ranker(object):
    mirrorlist_url = None  # This is replaced by subclasses
    distro_name = None

    def __init__(self, parser = 'lxml', *args, **kwargs):
        self.my_info = {}
        self.raw_html = None
        self.parser = parser
        self.bs = None
        self.req = None
        self.get_myinfo()
        # The native collection of mirror information.
        self.raw_mirrors = []
        # The list of URLs only of the above.
        self.mirror_candidates = []
        self.ranked_mirrors = {}
        self.ranked_urls = {}

    def extract_mirrors(self):
        # A dummy func. This should be overridden by subclasses.
@@ -34,6 +43,79 @@ class Ranker(object):
        req = requests.get(self.mirrorlist_url)
        if not req.ok:
            raise RuntimeError('Could not contact information gatherer')
        self.raw_html = req.content.decode('utf-8')
        self.req = req
        self.raw_html = self.req.content.decode('utf-8')
        self.bs = BeautifulSoup(self.raw_html, self.parser)
        return(None)

    def speedcheck(self):
        if not self.mirror_candidates:
            self.extract_mirrors()
        for url in self.mirror_candidates:
            u = urlparse.urlparse(url)
            sock = socket.socket()
            sock.settimeout(7)
            port = u.port
            if not port:
                port = constants.DEF_PORTS[u.scheme.lower()]
            try:
                start = time.perf_counter()
                sock.connect((u.hostname, port))
                conntime = time.perf_counter() - start  # in seconds
                sock.close()
                del(sock)
            except (socket.timeout, socket.error):
                continue
            # Skip the mirror if it has an exact time in the mirrors already.
            # Sure, it's *very* unlikely, but best practice to do this.
            if conntime in self.ranked_mirrors:
                continue
            mirror = {}
            for a in ('path', 'port'):
                mirror[a] = getattr(u, a, None)
            mirror['domain'] = u.hostname.lower()
            mirror['syncType'] = u.scheme.lower()
            if not mirror['port']:
                mirror['port'] = constants.DEF_PORTS[mirror['syncType']]
            if mirror['path'] == '':
                mirror['path'] = '/'
            self.ranked_mirrors[conntime] = mirror
            self.ranked_urls[conntime] = url
        return(None)

    def print(self):
        if not self.ranked_mirrors:
            self.speedcheck()
        print('Mirrors in order of speed:\n')
        for m in sorted(list(self.ranked_urls.keys())):
            print('{0} # ({1} seconds to connect)'.format(self.ranked_urls[m], m))
        return(None)

    def gen_xml(self):
        if not self.distro_name:
            raise ValueError('This class must be subclassed to be useful')
        if not self.ranked_mirrors:
            self.speedcheck()
        s = ('<?xml version="1.0" encoding="UTF-8" ?>'
             '<mirror xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" '
             'xmlns="https://git.square-r00t.net/RepoMirror/" '
             'xsi:schemaLocation="https://git.square-r00t.net/RepoMirror/ '
             'http://schema.xml.r00t2.io/projects/repomirror.xsd">'
             '</mirror>')
        xml = etree.fromstring(s.encode('utf-8'))
        distro = etree.Element('distro')
        distro.attrib['name'] = self.distro_name
        for m in sorted(list(self.ranked_mirrors.keys())):
            mirror = self.ranked_mirrors[m]
            distro.append(etree.Comment(' ({0} seconds to connect) '.format(m)))
            u = etree.SubElement(distro, 'upstream')
            for k, v in mirror.items():
                e = etree.SubElement(u, k)
                e.text = str(v)
        xml.append(distro)
        return(etree.tostring(xml,
                              pretty_print = True,
                              with_comments = True,
                              with_tail = True,
                              encoding = 'UTF-8',
                              xml_declaration = True).decode('utf-8'))
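The ranking above boils down to timing a bare TCP connect. A standalone sketch (not part of the commit) of that measurement for a single host, mirroring speedcheck()'s use of socket and time.perf_counter(); the hostname is a placeholder.

import socket
import time

def connect_time(hostname, port, timeout = 7):
    sock = socket.socket()
    sock.settimeout(timeout)
    try:
        start = time.perf_counter()
        sock.connect((hostname, port))
        return(time.perf_counter() - start)  # seconds, like speedcheck()'s conntime
    except (socket.timeout, socket.error):
        return(None)
    finally:
        sock.close()

print(connect_time('example.com', 443))  # 443 is DEF_PORTS['https'] in constants.py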
@@ -1 +1,5 @@
MYINFO_URL = 'https://ipinfo.io'
DEF_PORTS = {'ftp': 21,
             'http': 80,
             'https': 443,
             'rsync': 873}
@@ -1,20 +0,0 @@
#!/usr/bin/env python3

import requests
from bs4 import BeautifulSoup


country = 'US'
url = 'https://www.archlinux.org/mirrors/status/tier/1/'

req = requests.get(url)
html = req.content.decode('utf-8')
bs = BeautifulSoup(html, 'lxml')

mirrors = bs.find('table', {'id': 'successful_mirrors'})
header = mirrors.find('thead').find('tr')
headers = [h.text if h.text != '' else 'details' for h in header.find_all('th')]

results = [{headers[i]: cell.text for i, cell in enumerate(row.find_all('td'))} for row in mirrors.find_all('tr')]

import pprint
pprint.pprint(results)