summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorbrent s <bts@square-r00t.net>2020-06-14 03:46:29 -0400
committerbrent s <bts@square-r00t.net>2020-06-14 03:46:29 -0400
commiteed480c590927f5128a1e6490383a2ef1787bf57 (patch)
treeebb3a71b65558449c9c44fb590152185fa811135
parent2ba79cd801f3a87fb6f785a8e9fdf0f6a2508cb2 (diff)
downloadRepoMirror-eed480c590927f5128a1e6490383a2ef1787bf57.tar.xz
checking in work
-rw-r--r--repomirror/__init__.py9
-rw-r--r--repomirror/sync.py61
-rw-r--r--sample.config.xml8
-rwxr-xr-xutils/find_fastest_upstream/archlinux.py67
-rwxr-xr-xutils/find_fastest_upstream/centos.py79
-rw-r--r--utils/find_fastest_upstream/classes.py84
-rw-r--r--utils/find_fastest_upstream/constants.py4
-rwxr-xr-xutils/find_fastest_upstream/test.py20
8 files changed, 267 insertions, 65 deletions
diff --git a/repomirror/__init__.py b/repomirror/__init__.py
index e309666..725b28d 100644
--- a/repomirror/__init__.py
+++ b/repomirror/__init__.py
@@ -4,15 +4,8 @@ import logging
##
from . import config
from . import constants
+from . import sync
_logger = logging.getLogger()
-
-class Sync(object):
- def __init__(self, cfg = None, dummy = False, distro = None, logdir = None, *args, **kwargs):
- _args = dict(locals())
- del(_args['self'])
- _logger.debug('Sync class instantiated with args: {0}'.format(_args))
- self.cfg = config.Config(cfg)
-
diff --git a/repomirror/sync.py b/repomirror/sync.py
new file mode 100644
index 0000000..885c505
--- /dev/null
+++ b/repomirror/sync.py
@@ -0,0 +1,61 @@
+import datetime
+import logging
+import os
+##
+from . import config
+
+
+_logger = logging.getLogger()
+
+
+class Args(object):
+    """Holder for the <short>/<long> argument elements of an XML node; parsing is not implemented yet."""
+
+    def __init__(self, args_xml):
+        # args_xml must provide an .xpath() method (presumably an lxml element - confirm against the caller).
+        self.xml = args_xml
+        self.args = []
+        self._parse_xml()
+
+    def _parse_xml(self):
+        """Walk the matching child elements; self.args is left empty until the conversion is written."""
+        for arg_xml in self.xml.xpath('(short|long)'):
+            # TODO: translate arg_xml into an entry of self.args.
+            # The explicit placeholder keeps this module (and the package importing it) importable.
+            pass
+        return(None)
+
+
+
+class Mount(object):
+    """Record whether a path is currently listed as a mountpoint in /proc/mounts."""
+
+    def __init__(self, mpchk_xml):
+        # Accept a plain path string or an element whose .text is the path (the way TimestampFile reads its
+        # element); passing an element straight to expanduser() would raise TypeError.
+        raw_path = getattr(mpchk_xml, 'text', mpchk_xml)
+        self.path = os.path.abspath(os.path.expanduser(raw_path))
+        self.is_mounted = None
+        self._check_mount()
+
+    def _check_mount(self):
+        """Set self.is_mounted to True if self.path is a mountpoint in /proc/mounts, otherwise to False."""
+        # /proc/mounts octal-escapes these characters inside paths. The backslash is decoded last so that a
+        # literal backslash is never mistaken for the start of another escape.
+        escapes = (('\\040', ' '), ('\\011', '\t'), ('\\012', '\n'), ('\\134', '\\'))
+        self.is_mounted = False
+        with open('/proc/mounts', 'r') as fh:
+            for line in fh:
+                fields = line.split()
+                if len(fields) < 2:
+                    # Blank or malformed line: there is no mountpoint field to compare.
+                    continue
+                mountpoint = fields[1]
+                for escaped, char in escapes:
+                    mountpoint = mountpoint.replace(escaped, char)
+                if mountpoint == self.path:
+                    self.is_mounted = True
+                    break
+        return(None)
+
+
+class TimestampFile(object):
+    """Absolute path and time format string of a timestamp file, both read from one XML element."""
+
+    def __init__(self, ts_xml):
+        # The two special names map to epoch-based format strings; any other value is kept verbatim.
+        special_formats = {'UNIX_EPOCH': '%s',
+                           'MICROSECOND_EPOCH': '%s.%f'}
+        requested = ts_xml.attrib.get('timeFormat', 'UNIX_EPOCH')
+        self.fmt = special_formats.get(requested, requested)
+        self.path = os.path.abspath(os.path.expanduser(ts_xml.text))
+
+
+class Upstream(object):
+    """Stub for an upstream definition; the constructor accepts its argument and does nothing with it."""
+
+    def __init__(self, upstream_xml):
+        # Nothing is parsed or stored yet.
+        return None
+
+
+class Sync(object):
+    """Logs the arguments it was constructed with and builds a config.Config from cfg."""
+
+    def __init__(self, cfg = None, dummy = False, distro = None, logdir = None, *args, **kwargs):
+        # Snapshot the call arguments before any other local exists, then drop the instance itself.
+        call_args = locals().copy()
+        call_args.pop('self')
+        _logger.debug('Sync class instantiated with args: {0}'.format(call_args))
+        self.cfg = config.Config(cfg)
diff --git a/sample.config.xml b/sample.config.xml
index 4e4c692..14c60a8 100644
--- a/sample.config.xml
+++ b/sample.config.xml
@@ -28,8 +28,8 @@
* https://strftime.org/
The default is to use a regular UNIX Epoch integer (e.g. June 13, 2020 5:03:53 PM UTC => 1592067833).
This can be manually specified by the special string "UNIX_EPOCH".
- Optionally, you can use the special string "MICROSECOND_EPOCH", which will specify the above with microseconds.
- e.g. June 13, 2020 5:09:13.995777 PM UTC => 1592068153.995777
+ Optionally, you can use the special string "MICROSECOND_EPOCH", which will specify the above with microseconds
+ zero-padded on the left to six digits (e.g. June 13, 2020 5:09:13.995777 PM UTC => 1592068153.995777).
-->
<lastLocalCheck timeFormat="MICROSECOND_EPOCH">/srv/http/arch.lastcheck</lastLocalCheck>
<!--
@@ -84,6 +84,10 @@
<!--
The following example uses "rsync://arch.mirror.constant.com/archlinux/"
(https://www.archlinux.org/mirrors/constant.com/1008/)
+ If you need to find a mirror, you may be interested in the utils/find_fastest_upstream/ scripts. They will
+ automatically find (and sort based on connection speed) all mirrors in your country for a given distro.
+ They can even generate stubbed configuration files using those upstreams.
+ Currently only Arch Linux and CentOS are supported.
-->
<!--
Required; one of:
diff --git a/utils/find_fastest_upstream/archlinux.py b/utils/find_fastest_upstream/archlinux.py
index 32fc1ce..8cc7ca8 100755
--- a/utils/find_fastest_upstream/archlinux.py
+++ b/utils/find_fastest_upstream/archlinux.py
@@ -1,57 +1,56 @@
#!/usr/bin/env python3
+import argparse
import datetime
-import re
-##
-import iso3166
##
import classes
-_strip_re = re.compile(r'^\s*(?P<num>[0-9.]+).*$')
-
-
class Ranker(classes.Ranker):
- mirrorlist_url = 'https://www.archlinux.org/mirrors/status/tier/1/'
+    """Collects the Arch Linux tier 1 rsync mirrors that are in the local system's country."""
+    mirrorlist_url = 'https://www.archlinux.org/mirrors/status/tier/1/json/'
+    distro_name = 'archlinux'
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.get_mirrors()
- self.mycountry = iso3166.countries_by_alpha2[self.my_info['country']].name
def extract_mirrors(self):
- # Limit to only successful mirrors.
- mirrors = self.bs.find('table', {'id': 'successful_mirrors'})
- # Ayyy, thanks dude.
- # Modified from https://stackoverflow.com/a/56835562/733214.
- header = mirrors.find('thead').find('tr')
- headers = [h.text if h.text != '' else 'details' for h in header.find_all('th')]
- raw_rows = mirrors.find_all('tr')
- # rows = [{headers[i]: cell.text for i, cell in enumerate(row.find_all('td'))} for row in raw_rows]
- rows = [{headers[i]: cell for i, cell in enumerate(row.find_all('td'))} for row in raw_rows]
- for r in rows:
- for k, v in r.items():
- print(v)
- if k in ('Completion %', 'Mirror Score', 'μ Duration (s)', 'σ Duration (s)'):
- r[k] = float(_strip_re.sub(r'\g<num>', v.text).strip())
- elif k == 'μ Delay (hh:mm)':
- # HOO boy. Wish they just did it in seconds.
- # elif k == 'Country':
- self.raw_mirrors.append(r)
- # for row in rows:
- # if not row:
- # continue
- # for k in ('Completion %', 'Mirror Score', 'μ Duration (s)', 'σ Duration (s)'):
- # row[k] = float(_strip_re.sub(r'\g<num>', row[k]).strip())
-
+        """
+        Fill self.raw_mirrors and self.mirror_candidates from the 'urls' list of the mirror status JSON.
+
+        A mirror is kept only when it is active, carries ISOs, is served over rsync, has the same country code
+        as self.my_info['country'] and reports a completion of at least 1.0.
+        """
+        my_country = self.my_info['country'].upper()
+        for mirror in self.req.json()['urls']:
+            # Defensive: JSON null arrives as None, and comparing or parsing None would raise. Such values are
+            # treated as "does not match" instead.
+            completion = mirror.get('completion_pct')
+            country = mirror.get('country_code') or ''
+            if not mirror.get('active'):  # Only successful/active mirrors
+                continue
+            if not mirror.get('isos'):  # Only mirrors with ISOs
+                continue
+            # Only mirrors that support rsync (Arch mirrors do not support ftp)
+            if mirror.get('protocol') != 'rsync':
+                continue
+            # Only mirrors in the system's country (May be buggy if both are not ISO-3166-1 Alpha-2)
+            if country.upper() != my_country:
+                continue
+            # Only mirrors that are at least 100% complete.
+            if completion is None or completion < 1.0:
+                continue
+            # Convert the timestamp to python-native (left as-is when the mirror reports none).
+            if mirror.get('last_sync'):
+                mirror['last_sync'] = datetime.datetime.strptime(mirror['last_sync'], '%Y-%m-%dT%H:%M:%SZ')
+            self.raw_mirrors.append(mirror)
+            self.mirror_candidates.append(mirror['url'])
return(None)
+def parseArgs():
+    """Build, without running, the command-line parser; its only option is the -x/--xml flag."""
+    parser = argparse.ArgumentParser(
+        description = 'Generate a list of suitable Arch Linux upstream mirrors in order of speed')
+    parser.add_argument('-x', '--xml',
+                        action = 'store_true',
+                        dest = 'xml',
+                        help = 'If specified, generate a config stub instead of a printed list of URLs')
+    return(parser)
+
+
def main():
+ # Parse the command line, collect and speed-rank the mirrors, then emit either the XML config stub
+ # (-x/--xml) or the plain list of URLs.
+ args = parseArgs().parse_args()
r = Ranker()
r.extract_mirrors()
- import pprint
- pprint.pprint(r.raw_mirrors)
+ r.speedcheck()
+ if args.xml:
+ print(r.gen_xml())
+ else:
+ r.print()
return(None)
diff --git a/utils/find_fastest_upstream/centos.py b/utils/find_fastest_upstream/centos.py
index e69de29..973a6dc 100755
--- a/utils/find_fastest_upstream/centos.py
+++ b/utils/find_fastest_upstream/centos.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python3
+
+import argparse
+import csv
+import datetime
+import io
+import re
+##
+import classes
+
+
+_proto_re = re.compile(r'^(?P<proto>https?)(?P<uri>.*)')
+
+
+class Ranker(classes.Ranker):
+    """Collects the CentOS mirrors in the local system's country, preferring one sync protocol."""
+    # https://lists.centos.org/pipermail/centos-mirror/2017-March/010312.html
+    mirrorlist_url = 'https://www.centos.org/download/full-mirrorlist.csv'
+    distro_name = 'centos'
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.get_mirrors()
+
+    def extract_mirrors(self, preferred_proto = 'rsync'):
+        """
+        Fill self.raw_mirrors and self.mirror_candidates from the downloaded CSV mirror list.
+
+        preferred_proto: 'rsync' or 'ftp' (case-insensitive). A row's link for this protocol is used when it has
+                         one; otherwise the other protocol's link is used. Rows with neither are skipped.
+        Raises ValueError if preferred_proto is neither 'rsync' nor 'ftp'.
+        """
+        preferred_proto = preferred_proto.lower()
+        if preferred_proto not in ('rsync', 'ftp'):
+            raise ValueError('Invalid preferred_proto; must be one of rsync or ftp')
+        non_preferred = ('rsync' if preferred_proto == 'ftp' else 'ftp')
+        my_country = self.my_info['country'].upper()
+        for row in csv.DictReader(io.StringIO(self.raw_html)):
+            # csv.DictReader yields None for cells missing from a short row (and a list of surplus cells under
+            # the None key for a long one), so a cell is never assumed to be a string.
+            country = (row['Country'] or '').strip()
+            if not country:
+                continue
+            # GorRAM it, dudes. States are not countries.
+            region = (row['Region'] or '').strip()
+            if region == 'US':
+                country = region
+            # Compared case-insensitively, the same way the Arch Linux ranker does it.
+            if country.upper() != my_country:
+                continue
+            # Blank cells become None so the link checks below are simple truth tests.
+            for k, v in row.items():
+                if isinstance(v, str) and v.strip() == '':
+                    row[k] = None
+            pref_url = row['{0} mirror link'.format(preferred_proto)]
+            nonpref_url = row['{0} mirror link'.format(non_preferred)]
+            if pref_url:
+                url = _proto_re.sub(r'{0}\g<uri>'.format(preferred_proto), pref_url)
+            elif nonpref_url:
+                url = _proto_re.sub(r'{0}\g<uri>'.format(non_preferred), nonpref_url)
+            else:
+                continue
+            self.raw_mirrors.append(row)
+            self.mirror_candidates.append(url)
+        return(None)
+
+
+def parseArgs():
+    """Build, without running, the command-line parser; its only option is the -x/--xml flag."""
+    parser = argparse.ArgumentParser(
+        description = 'Generate a list of suitable CentOS upstream mirrors in order of speed')
+    parser.add_argument('-x', '--xml',
+                        action = 'store_true',
+                        dest = 'xml',
+                        help = 'If specified, generate a config stub instead of a printed list of URLs')
+    return(parser)
+
+
+def main():
+    """Collect and speed-rank the mirrors, then print the XML config stub (-x/--xml) or the plain URL list."""
+    cli_args = parseArgs().parse_args()
+    ranker = Ranker()
+    ranker.extract_mirrors()
+    ranker.speedcheck()
+    if not cli_args.xml:
+        ranker.print()
+    else:
+        print(ranker.gen_xml())
+    return(None)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/utils/find_fastest_upstream/classes.py b/utils/find_fastest_upstream/classes.py
index a189cde..308986c 100644
--- a/utils/find_fastest_upstream/classes.py
+++ b/utils/find_fastest_upstream/classes.py
@@ -1,22 +1,31 @@
import socket
import time
+from urllib import parse as urlparse
##
import requests
from bs4 import BeautifulSoup
+from lxml import etree
##
import constants
class Ranker(object):
mirrorlist_url = None # This is replaced by subclasses
+ distro_name = None
def __init__(self, parser = 'lxml', *args, **kwargs):
self.my_info = {}
self.raw_html = None
self.parser = parser
self.bs = None
+ # The response object for mirrorlist_url, once it has been fetched.
+ self.req = None
self.get_myinfo()
+ # The native collection of mirror information.
self.raw_mirrors = []
+ # The list of URLs only of the above.
+ self.mirror_candidates = []
+ # Both are filled by speedcheck() and keyed by the connect time in seconds: the parsed mirror details,
+ # and the original URL.
+ self.ranked_mirrors = {}
+ self.ranked_urls = {}
def extract_mirrors(self):
# A dummy func. This should be overridden by subclasses.
@@ -34,6 +43,79 @@ class Ranker(object):
req = requests.get(self.mirrorlist_url)
if not req.ok:
raise RuntimeError('Could not contact information gatherer')
- self.raw_html = req.content.decode('utf-8')
+ self.req = req
+ self.raw_html = self.req.content.decode('utf-8')
self.bs = BeautifulSoup(self.raw_html, self.parser)
return(None)
+
+    def speedcheck(self):
+        """
+        Time a TCP connect to every candidate mirror and record the results.
+
+        Fills self.ranked_mirrors and self.ranked_urls, both keyed by the connect time in seconds. A mirror is
+        skipped when its URL has no hostname or no usable port, or when the connection fails or times out.
+        """
+        if not self.mirror_candidates:
+            self.extract_mirrors()
+        for url in self.mirror_candidates:
+            u = urlparse.urlparse(url)
+            scheme = u.scheme.lower()
+            try:
+                port = u.port or constants.DEF_PORTS.get(scheme)
+            except ValueError:
+                # urlparse raises on a non-numeric or out-of-range port.
+                port = None
+            if not u.hostname or not port:
+                # Nothing to connect to: malformed URL, or a scheme without a known default port.
+                continue
+            try:
+                # The context manager closes the socket even when the connection attempt fails.
+                with socket.socket() as sock:
+                    sock.settimeout(7)
+                    start = time.perf_counter()
+                    sock.connect((u.hostname, port))
+                    conntime = time.perf_counter() - start  # in seconds
+            except (socket.timeout, socket.error):
+                continue
+            # Skip the mirror if it has an exact time in the mirrors already (the dicts are keyed by it).
+            # Sure, it's *very* unlikely, but best practice to do this.
+            if conntime in self.ranked_mirrors:
+                continue
+            # Key order is kept (path, port, domain, syncType) because gen_xml() emits the children in it.
+            mirror = {'path': u.path or '/',
+                      'port': port,
+                      'domain': u.hostname.lower(),
+                      'syncType': scheme}
+            self.ranked_mirrors[conntime] = mirror
+            self.ranked_urls[conntime] = url
+        return(None)
+
+    def print(self):
+        """Print the ranked mirror URLs, fastest first, each annotated with its connect time."""
+        if not self.ranked_mirrors:
+            self.speedcheck()
+        print('Mirrors in order of speed:\n')
+        # The keys are the connect times, so sorting the items sorts by speed.
+        for conntime, url in sorted(self.ranked_urls.items()):
+            print('{0} # ({1} seconds to connect)'.format(url, conntime))
+        return(None)
+
+    def gen_xml(self):
+        """
+        Return a pretty-printed XML config stub listing the ranked mirrors, fastest first.
+
+        Each mirror becomes an <upstream> element under one <distro name="..."> element, preceded by a comment
+        giving its connect time. Raises ValueError when distro_name is not set.
+        """
+        if not self.distro_name:
+            raise ValueError('This class must be subclassed to be useful')
+        if not self.ranked_mirrors:
+            self.speedcheck()
+        skeleton = ('<?xml version="1.0" encoding="UTF-8" ?>'
+                    '<mirror xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" '
+                    'xmlns="https://git.square-r00t.net/RepoMirror/" '
+                    'xsi:schemaLocation="https://git.square-r00t.net/RepoMirror/ '
+                    'http://schema.xml.r00t2.io/projects/repomirror.xsd">'
+                    '</mirror>')
+        root = etree.fromstring(skeleton.encode('utf-8'))
+        distro = etree.Element('distro')
+        distro.attrib['name'] = self.distro_name
+        # The keys are the connect times, so iterating them sorted lists the fastest mirror first.
+        for conntime in sorted(self.ranked_mirrors):
+            distro.append(etree.Comment(' ({0} seconds to connect) '.format(conntime)))
+            upstream = etree.SubElement(distro, 'upstream')
+            for tag, value in self.ranked_mirrors[conntime].items():
+                etree.SubElement(upstream, tag).text = str(value)
+        root.append(distro)
+        serialized = etree.tostring(root,
+                                    pretty_print = True,
+                                    with_comments = True,
+                                    with_tail = True,
+                                    encoding = 'UTF-8',
+                                    xml_declaration = True)
+        return(serialized.decode('utf-8'))
diff --git a/utils/find_fastest_upstream/constants.py b/utils/find_fastest_upstream/constants.py
index b2b0d2c..5411ade 100644
--- a/utils/find_fastest_upstream/constants.py
+++ b/utils/find_fastest_upstream/constants.py
@@ -1 +1,5 @@
MYINFO_URL = 'https://ipinfo.io'
+# Default port for each URL scheme, looked up when a mirror URL does not carry an explicit port.
+DEF_PORTS = {'ftp': 21,
+ 'http': 80,
+ 'https': 443,
+ 'rsync': 873}
diff --git a/utils/find_fastest_upstream/test.py b/utils/find_fastest_upstream/test.py
deleted file mode 100755
index 35e31ab..0000000
--- a/utils/find_fastest_upstream/test.py
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/usr/bin/env python3
-
-import requests
-from bs4 import BeautifulSoup
-
-country = 'US'
-url = 'https://www.archlinux.org/mirrors/status/tier/1/'
-
-req = requests.get(url)
-html = req.content.decode('utf-8')
-bs = BeautifulSoup(html, 'lxml')
-
-mirrors = bs.find('table', {'id': 'successful_mirrors'})
-header = mirrors.find('thead').find('tr')
-headers = [h.text if h.text != '' else 'details' for h in header.find_all('th')]
-
-results = [{headers[i]: cell.text for i, cell in enumerate(row.find_all('td'))} for row in mirrors.find_all('tr')]
-
-import pprint
-pprint.pprint(results)