summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorbrent s <bts@square-r00t.net>2020-06-14 00:53:12 -0400
committerbrent s <bts@square-r00t.net>2020-06-14 00:53:12 -0400
commit2ba79cd801f3a87fb6f785a8e9fdf0f6a2508cb2 (patch)
tree2efcb7df2a232c3c08f5bac1d9b0e3d01f1b69ce
parent552611174365d2fd5ce1e6bb1f5abebfd8a09a10 (diff)
downloadRepoMirror-2ba79cd801f3a87fb6f785a8e9fdf0f6a2508cb2.tar.xz
about to change up a lot of stuff...
-rw-r--r--sample.config.xml86
-rwxr-xr-xutils/find_fastest_upstream/archlinux.py59
-rwxr-xr-xutils/find_fastest_upstream/centos.py0
-rw-r--r--utils/find_fastest_upstream/classes.py39
-rw-r--r--utils/find_fastest_upstream/constants.py1
-rwxr-xr-xutils/find_fastest_upstream/test.py20
6 files changed, 192 insertions, 13 deletions
diff --git a/sample.config.xml b/sample.config.xml
index c9ea1e2..4e4c692 100644
--- a/sample.config.xml
+++ b/sample.config.xml
@@ -7,56 +7,107 @@
xsi:schemaLocation="https://git.square-r00t.net/RepoMirror/ http://schema.xml.r00t2.io/projects/repomirror.xsd">
<distro name="arch">
<!--
+ If provided (and the sync script is running as the root user), the files/directories can be chowned to the
+ provided user/group. Otherwise they'll be owned by whatever user the script is running as (and its primary group).
+ -->
+ <owner>
+ <user>root</user>
+ <group>root</group>
+ </owner>
+ <!--
The local path to where the hierarchy/files should be synced to.
-->
<dest>/srv/repos/arch/.</dest>
<!--
- The local file to update with a timestamp with the last time we checked for updates.
+ The local file to update with a timestamp of the last time we *checked* for updates.
+ If not provided, don't update a file (NOT recommended!).
+ It may or may not be optional; check with the spec for mirroring for the specified distro.
+ If the timeFormat attribute is provided, write the timestamp in the specified format.
+ See the following for details:
+ * https://docs.python.org/library/datetime.html#strftime-and-strptime-format-codes
+ * https://strftime.org/
+ The default is to use a regular UNIX Epoch integer (e.g. June 13, 2020 5:03:53 PM UTC => 1592067833).
+ This can be manually specified by the special string "UNIX_EPOCH".
+ Optionally, you can use the special string "MICROSECOND_EPOCH", which writes the same epoch timestamp with microsecond precision.
+ e.g. June 13, 2020 5:09:13.995777 PM UTC => 1592068153.995777
-->
- <lastLocalCheck>/srv/http/arch.lastcheck</lastLocalCheck>
+ <lastLocalCheck timeFormat="MICROSECOND_EPOCH">/srv/http/arch.lastcheck</lastLocalCheck>
<!--
- The file to update with a timestamp with the last time we synced from our upstream.
+ The file to update with a timestamp of the last time we *synced from our upstream*.
+ If not provided, don't update a file (NOT recommended!).
+ It may or may not be optional; check with the spec for mirroring for the specified distro.
+ If not provided, don't update a file (NOT recommended!).
+ It takes the same optional attribute "timeFormat" as above, with the same behavior.
-->
- <lastLocalSync>/srv/http/arch.lastsync</lastLocalSync>
+ <lastLocalSync timeFormat="UNIX_EPOCH">/srv/repos/arch/lastsync</lastLocalSync>
<!--
The path to a file on the upstream(s) that gives a time when it last updated.
+ The optional timeFormat attribute behavior is the same as above.
+ If neither this nor lastRemoteSync is provided, a sync will be attempted regardless of when the last one was
+ attempted.
-->
- <lastRemoteUpdate>/lastupdate</lastRemoteUpdate>
+ <lastRemoteUpdate timeFormat="UNIX_EPOCH">/lastupdate</lastRemoteUpdate>
<!--
The path to a file on the upstream(s) that gives a time when it last synced from its upstream.
+ The optional timeFormat attribute behavior is the same as above.
+ If neither this nor lastRemoteUpdate is provided, a sync will be attempted regardless of when the last one was
+ attempted.
-->
- <lastRemoteSync>/lastsync</lastRemoteSync>
+ <lastRemoteSync timeFormat="UNIX_EPOCH">/lastsync</lastRemoteSync>
<!--
The path that must be currently mounted for sync to proceed.
+ This is required.
-->
<mountCheck>/</mountCheck>
<!--
- The speed to cap socket bandwidth at (in KiB). Decimals are okay.
+ You cannot reliably use two dashes in XML strings, so this is a workaround.
+ The following is only used for rsync upstreams and is optional. The default is just archive and delete-after.
+ If arguments are provided, the defaults are overridden, so if you need the above, be sure to specify them.
+ See the rsync man page (rsync(1)) for more details and a listing of supported flags on your system.
-->
- <bwlimit>7000</bwlimit>
+ <rsyncArgs>
+ <!--
+ A "long" option (two hyphens).
+ -->
+ <long>archive</long>
+ <long>delete-after</long>
+ <!--
+ An argument with a value (info=2).
+ -->
+ <long value="2">info</long>
+ <!--
+ A "short" option (single hyphen).
+ -->
+ <short>c</short><!-- checksum -->
+ </rsyncArgs>
<upstream>
<!--
The following example uses "rsync://arch.mirror.constant.com/archlinux/"
(https://www.archlinux.org/mirrors/constant.com/1008/)
-->
<!--
- One of:
+ Required; one of:
* rsync
* ftp
-->
<syncType>rsync</syncType>
<!--
- ONLY the domain goes here.
+ Required; ONLY the domain goes here.
-->
<domain>arch.mirror.constant.com</domain>
<!--
- If not specified,the protocol's default port will be used.
+ Optional; if not specified, the protocol's default port will be used.
-->
<port>873</port>
<!--
- The *remote* path part of the URI. The leading / is necessary. A trailing one will be assumed.
+ Required; the *remote* path part of the URI. The leading / is necessary. A trailing one will be assumed.
-->
<path>/archlinux/</path>
+ <!--
+ The speed to cap socket bandwidth at (in KiB). Decimals are okay.
+ Only valid for rsync; ignored for FTP. If not provided, the default is to not throttle.
+ -->
+ <bwlimit>7000</bwlimit>
</upstream>
<!--
Multiple upstreams can be specified. They are tried in order specified and if connection fails or times out,
@@ -77,5 +128,14 @@
<path>/distros/archlinux/</path>
</upstream>
</distro>
- <distro name="centos"/>
+ <distro name="centos">
+ <upstream>
+ <syncType>rsync</syncType>
+ <domain>mirrors.rit.edu</domain>
+ <path>/centos/</path>
+ </upstream>
+ <dest>/srv/repos/arch/.</dest>
+ <lastLocalCheck timeFormat="MICROSECOND_EPOCH">/srv/http/centos.lastcheck</lastLocalCheck>
+ <lastLocalSync timeFormat="UNIX_EPOCH">/srv/repos/arch/lastsync</lastLocalSync>
+ </distro>
</mirror>
diff --git a/utils/find_fastest_upstream/archlinux.py b/utils/find_fastest_upstream/archlinux.py
new file mode 100755
index 0000000..32fc1ce
--- /dev/null
+++ b/utils/find_fastest_upstream/archlinux.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+
+import datetime
+import re
+##
+import iso3166
+##
+import classes
+
+
+_strip_re = re.compile(r'^\s*(?P<num>[0-9.]+).*$')
+
+
+class Ranker(classes.Ranker):
+ mirrorlist_url = 'https://www.archlinux.org/mirrors/status/tier/1/'
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.get_mirrors()
+ self.mycountry = iso3166.countries_by_alpha2[self.my_info['country']].name
+
+ def extract_mirrors(self):
+ # Limit to only successful mirrors.
+ mirrors = self.bs.find('table', {'id': 'successful_mirrors'})
+ # Ayyy, thanks dude.
+ # Modified from https://stackoverflow.com/a/56835562/733214.
+ header = mirrors.find('thead').find('tr')
+ headers = [h.text if h.text != '' else 'details' for h in header.find_all('th')]
+ raw_rows = mirrors.find_all('tr')
+ # rows = [{headers[i]: cell.text for i, cell in enumerate(row.find_all('td'))} for row in raw_rows]
+ rows = [{headers[i]: cell for i, cell in enumerate(row.find_all('td'))} for row in raw_rows]
+ for r in rows:
+ for k, v in r.items():
+ print(v)
+ if k in ('Completion %', 'Mirror Score', 'μ Duration (s)', 'σ Duration (s)'):
+ r[k] = float(_strip_re.sub(r'\g<num>', v.text).strip())
+ elif k == 'μ Delay (hh:mm)':
+ # HOO boy. Wish they just did it in seconds.
+ # elif k == 'Country':
+ self.raw_mirrors.append(r)
+ # for row in rows:
+ # if not row:
+ # continue
+ # for k in ('Completion %', 'Mirror Score', 'μ Duration (s)', 'σ Duration (s)'):
+ # row[k] = float(_strip_re.sub(r'\g<num>', row[k]).strip())
+
+ return(None)
+
+
+def main():
+ r = Ranker()
+ r.extract_mirrors()
+ import pprint
+ pprint.pprint(r.raw_mirrors)
+ return(None)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/utils/find_fastest_upstream/centos.py b/utils/find_fastest_upstream/centos.py
new file mode 100755
index 0000000..e69de29
--- /dev/null
+++ b/utils/find_fastest_upstream/centos.py
diff --git a/utils/find_fastest_upstream/classes.py b/utils/find_fastest_upstream/classes.py
new file mode 100644
index 0000000..a189cde
--- /dev/null
+++ b/utils/find_fastest_upstream/classes.py
@@ -0,0 +1,39 @@
+import socket
+import time
+##
+import requests
+from bs4 import BeautifulSoup
+##
+import constants
+
+
+class Ranker(object):
+ mirrorlist_url = None # This is replaced by subclasses
+
+ def __init__(self, parser = 'lxml', *args, **kwargs):
+ self.my_info = {}
+ self.raw_html = None
+ self.parser = parser
+ self.bs = None
+ self.get_myinfo()
+ self.raw_mirrors = []
+
+ def extract_mirrors(self):
+ # A dummy func. This should be overridden by subclasses.
+ pass
+ return(None)
+
+ def get_myinfo(self):
+ req = requests.get(constants.MYINFO_URL)
+ if not req.ok:
+ raise RuntimeError('Could not contact information gatherer')
+ self.my_info = req.json()
+ return(None)
+
+ def get_mirrors(self):
+ req = requests.get(self.mirrorlist_url)
+ if not req.ok:
+ raise RuntimeError('Could not contact information gatherer')
+ self.raw_html = req.content.decode('utf-8')
+ self.bs = BeautifulSoup(self.raw_html, self.parser)
+ return(None)
diff --git a/utils/find_fastest_upstream/constants.py b/utils/find_fastest_upstream/constants.py
new file mode 100644
index 0000000..b2b0d2c
--- /dev/null
+++ b/utils/find_fastest_upstream/constants.py
@@ -0,0 +1 @@
+MYINFO_URL = 'https://ipinfo.io'
diff --git a/utils/find_fastest_upstream/test.py b/utils/find_fastest_upstream/test.py
new file mode 100755
index 0000000..35e31ab
--- /dev/null
+++ b/utils/find_fastest_upstream/test.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python3
+
+import requests
+from bs4 import BeautifulSoup
+
+country = 'US'
+url = 'https://www.archlinux.org/mirrors/status/tier/1/'
+
+req = requests.get(url)
+html = req.content.decode('utf-8')
+bs = BeautifulSoup(html, 'lxml')
+
+mirrors = bs.find('table', {'id': 'successful_mirrors'})
+header = mirrors.find('thead').find('tr')
+headers = [h.text if h.text != '' else 'details' for h in header.find_all('th')]
+
+results = [{headers[i]: cell.text for i, cell in enumerate(row.find_all('td'))} for row in mirrors.find_all('tr')]
+
+import pprint
+pprint.pprint(results)