From caca1d1e84175c0749fe9248e26c0d02aa649157 Mon Sep 17 00:00:00 2001
From: brent s <r00t@square-r00t.net>
Date: Sat, 13 Jan 2018 02:10:15 -0500
Subject: [PATCH] new script, centos/isomirror_sort.py

---
 centos/isomirror_sort.py | 92 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 92 insertions(+)
 create mode 100644 centos/isomirror_sort.py

diff --git a/centos/isomirror_sort.py b/centos/isomirror_sort.py
new file mode 100644
index 0000000..d787fd7
--- /dev/null
+++ b/centos/isomirror_sort.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3.6
+
+# requires the third-party bs4 (BeautifulSoup) and lxml modules
+import os
+import socket
+import time
+from urllib.request import urlopen
+from urllib.parse import urlparse
+from bs4 import BeautifulSoup
+
+# The page that contains the list of (authoritative ISO) mirrors
+URL = 'http://isoredirect.centos.org/centos/7/isos/x86_64/'
+# The formatting on the page is pretty simple - no divs, etc. - so we need to
+# blacklist some links we pull in (getMirrors() skips any href listed here).
+blacklisted_link_URLs = ('http://bittorrent.com/',
+                         'http://wiki.centos.org/AdditionalResources/Repositories')
+
+mirrors = {}  # host -> {'proto', 'port', 'path', 'auth'} from getHosts(); getSpeeds() adds 'time'
+
+dflt_ports = {'https': 443,  # unlikely. "HTTPS is currently not used for mirrors." per https://wiki.centos.org/HowTos/CreatePublicMirrors
+              'http': 80,  # most likely.
+              'ftp': 21,
+              'rsync': 873}
+
+def getMirrors():
+    """Fetch URL and return the stripped hrefs of the mirror links that follow its fifth <br>."""
+    with urlopen(URL) as resp:
+        html = resp.read().decode('utf-8')
+    anchor = BeautifulSoup(html, 'lxml').find_all('br')[4]
+    # Only sibling <a> tags count, and blacklisted (non-mirror) links are skipped.
+    links = (node for node in anchor.next_siblings if node.name == 'a')
+    return [node['href'].strip() for node in links
+            if node['href'] not in blacklisted_link_URLs]
+
+def getHosts(mirror):
+    """Parse one mirror URI and record its connection details in ``mirrors``.
+
+    "mirror" should be a base URI of the CentOS mirror path
+    (mirrors.centos.org is pointless to use for this!). It is usually a
+    plain scheme://host/path URI but may carry a port and/or user[:pass]@.
+
+    The result is stored as mirrors[host] = {'proto', 'port', 'path', 'auth'},
+    where 'auth' is '' or 'user[:pass]@' and 'port' falls back to the
+    scheme's entry in dflt_ports when the URI does not name one.
+
+    urlparse()'s own hostname/port attributes are used rather than splitting
+    netloc on ':' by hand, so that host:port and user:pass@host URIs do not
+    leak the port or credentials into the host name. urlparse reports the
+    host name lower-cased.
+
+    Raises ValueError for a malformed port and KeyError for a scheme that
+    is not in dflt_ports. Returns an empty tuple.
+    """
+    uri = urlparse(mirror)
+    # Keep the credentials verbatim so the URI can be rebuilt unchanged.
+    userinfo, at_sign, _ = uri.netloc.rpartition('@')
+    login = userinfo + at_sign
+    # hostname is None when the URI has no host; keep the key a string.
+    fqdn = uri.hostname or uri.netloc
+    port = uri.port
+    if not port:
+        port = dflt_ports[uri.scheme]
+    mirrors[fqdn] = {'proto': uri.scheme,
+                     'port': port,
+                     'path': uri.path,
+                     'auth': login}
+    return ()
+
+def getSpeeds(timeout = 5):
+    """Time a TCP connect to each mirror into mirrors[fqdn]['time'] (seconds; inf if unreachable)."""
+    for fqdn, info in mirrors.items():
+        start = time.monotonic()
+        try:  # bounded by timeout; the with-block closes the socket even on error
+            with socket.create_connection((fqdn, info['port']), timeout):
+                info['time'] = time.monotonic() - start
+        except OSError:  # refused / timed out / unresolvable: rank this mirror last
+            info['time'] = float('inf')
+    return ()
+
+def main():
+    """Scrape the mirror list, time every mirror and print the mirror URIs fastest-first."""
+    for m in getMirrors():
+        getHosts(m)
+    getSpeeds()
+    for fqdn in sorted(mirrors, key = lambda k: mirrors[k]['time']):
+        info = mirrors[fqdn]
+        # Hide the port only when it is the default for this mirror's own scheme.
+        str_port = '' if info['port'] == dflt_ports.get(info['proto']) else ':' + str(info['port'])
+        print('{proto}://{auth}{0}{p}{path}'.format(fqdn, p = str_port, **info))
+
+if __name__ == '__main__':  # rank the mirrors only when run as a script, not on import
+    main()