about to change up a lot of stuff...

This commit is contained in:
brent s. 2020-06-14 00:53:12 -04:00
parent 5526111743
commit 2ba79cd801
Signed by: bts
GPG Key ID: 8C004C2F93481F6B
6 changed files with 192 additions and 13 deletions

View File

@ -6,57 +6,108 @@
xmlns="https://git.square-r00t.net/RepoMirror/" xmlns="https://git.square-r00t.net/RepoMirror/"
xsi:schemaLocation="https://git.square-r00t.net/RepoMirror/ http://schema.xml.r00t2.io/projects/repomirror.xsd"> xsi:schemaLocation="https://git.square-r00t.net/RepoMirror/ http://schema.xml.r00t2.io/projects/repomirror.xsd">
<distro name="arch"> <distro name="arch">
<!--
If provided (and the sync script is running as the root user), the files/directories can be chowned to the
provided user/group. Otherwise they'll be owned by whatever user the script is running as (and its primary group).
-->
<owner>
<user>root</user>
<group>root</group>
</owner>
<!-- <!--
The local path to where the hierarchy/files should be synced to. The local path to where the hierarchy/files should be synced to.
--> -->
<dest>/srv/repos/arch/.</dest> <dest>/srv/repos/arch/.</dest>
<!-- <!--
The local file to update with a timestamp with the last time we checked for updates. The local file to update with a timestamp with the last time we *checked* for updates.
If not provided, don't update a file (NOT recommended!).
It may or may not be optional; check with the spec for mirroring for the specified distro.
If the timeFormat attribute is provided, write the timestamp in the specified format.
See the following for details:
* https://docs.python.org/library/datetime.html#strftime-and-strptime-format-codes
* https://strftime.org/
The default is to use a regular UNIX Epoch integer (e.g. June 13, 2020 5:03:53 PM UTC => 1592067833).
This can be manually specified by the special string "UNIX_EPOCH".
Optionally, you can use the special string "MICROSECOND_EPOCH", which will specify the above with microseconds.
e.g. June 13, 2020 5:09:13.995777 PM UTC => 1592068153.995777
--> -->
<lastLocalCheck>/srv/http/arch.lastcheck</lastLocalCheck> <lastLocalCheck timeFormat="MICROSECOND_EPOCH">/srv/http/arch.lastcheck</lastLocalCheck>
<!-- <!--
The file to update with a timestamp with the last time we synced from our upstream. The file to update with a timestamp with the last time we *synced from our upstream*.
If not provided, don't update a file (NOT recommended!).
It may or may not be optional; check with the spec for mirroring for the specified distro.
It takes the same optional attribute "timeFormat" as above, with the same behaviour.
--> -->
<lastLocalSync>/srv/http/arch.lastsync</lastLocalSync> <lastLocalSync timeFormat="UNIX_EPOCH">/srv/repos/arch/lastsync</lastLocalSync>
<!-- <!--
The path to a file on the upstream(s) that gives a time when it last updated. The path to a file on the upstream(s) that gives a time when it last updated.
The optional timeFormat attribute behavior is the same as above.
If neither this nor lastRemoteSync is provided, a sync will be attempted regardless of when the last one was
attempted.
--> -->
<lastRemoteUpdate>/lastupdate</lastRemoteUpdate> <lastRemoteUpdate timeFormat="UNIX_EPOCH">/lastupdate</lastRemoteUpdate>
<!-- <!--
The path to a file on the upstream(s) that gives a time when it last synced from its upstream. The path to a file on the upstream(s) that gives a time when it last synced from its upstream.
The optional timeFormat attribute behavior is the same as above.
If neither this nor lastRemoteUpdate is provided, a sync will be attempted regardless of when the last one was
attempted.
--> -->
<lastRemoteSync>/lastsync</lastRemoteSync> <lastRemoteSync timeFormat="UNIX_EPOCH">/lastsync</lastRemoteSync>
<!-- <!--
The path that must be currently mounted for sync to proceed. The path that must be currently mounted for sync to proceed.
This is required.
--> -->
<mountCheck>/</mountCheck> <mountCheck>/</mountCheck>
<!-- <!--
The speed to cap socket bandwidth at (in KiB). Decimals are okay. You cannot reliably use two dashes in XML strings, so this is a workaround.
The following is only used for rsync upstreams and is optional. The default is just archive and delete-after.
If arguments are provided, the defaults are overwritten so if you need the above, be sure to specify them.
See the rsync man page (rsync(1)) for more details and a listing of supported flags on your system.
--> -->
<bwlimit>7000</bwlimit> <rsyncArgs>
<!--
A "long" option (two hyphens).
-->
<long>archive</long>
<long>delete-after</long>
<!--
An argument with a value (info=2).
-->
<long value="2">info</long>
<!--
A "short" option (single hyphen).
-->
<short>c</short><!-- checksum -->
</rsyncArgs>
<upstream> <upstream>
<!-- <!--
The following example uses "rsync://arch.mirror.constant.com/archlinux/" The following example uses "rsync://arch.mirror.constant.com/archlinux/"
(https://www.archlinux.org/mirrors/constant.com/1008/) (https://www.archlinux.org/mirrors/constant.com/1008/)
--> -->
<!-- <!--
One of: Required; one of:
* rsync * rsync
* ftp * ftp
--> -->
<syncType>rsync</syncType> <syncType>rsync</syncType>
<!-- <!--
ONLY the domain goes here. Required; ONLY the domain goes here.
--> -->
<domain>arch.mirror.constant.com</domain> <domain>arch.mirror.constant.com</domain>
<!-- <!--
If not specified, the protocol's default port will be used. Optional; if not specified, the protocol's default port will be used.
--> -->
<port>873</port> <port>873</port>
<!-- <!--
The *remote* path part of the URI. The leading / is necessary. A trailing one will be assumed. Required; the *remote* path part of the URI. The leading / is necessary. A trailing one will be assumed.
--> -->
<path>/archlinux/</path> <path>/archlinux/</path>
<!--
The speed to cap socket bandwidth at (in KiB per second). Decimals are okay.
Only valid for rsync; ignored for FTP. If not provided, the default is to not throttle.
-->
<bwlimit>7000</bwlimit>
</upstream> </upstream>
<!-- <!--
Multiple upstreams can be specified. They are tried in order specified and if connection fails or times out, Multiple upstreams can be specified. They are tried in order specified and if connection fails or times out,
@ -77,5 +128,14 @@
<path>/distros/archlinux/</path> <path>/distros/archlinux/</path>
</upstream> </upstream>
</distro> </distro>
<distro name="centos"/> <distro name="centos">
<upstream>
<syncType>rsync</syncType>
<domain>mirrors.rit.edu</domain>
<path>/centos/</path>
</upstream>
<dest>/srv/repos/centos/.</dest>
<lastLocalCheck timeFormat="MICROSECOND_EPOCH">/srv/http/centos.lastcheck</lastLocalCheck>
<lastLocalSync timeFormat="UNIX_EPOCH">/srv/repos/centos/lastsync</lastLocalSync>
</distro>
</mirror> </mirror>

View File

@ -0,0 +1,59 @@
#!/usr/bin/env python3

import datetime
import re
##
import iso3166
##
import classes


# Matches a whole line of text and captures, as group "num", the leading run of
# digits and dots (after optional whitespace). Used to strip trailing units or
# symbols from a table cell before converting it with float().
_strip_re = re.compile(r'^\s*(?P<num>[0-9.]+).*$')


class Ranker(classes.Ranker):
    """Collect per-mirror statistics from the Arch Linux tier 1 status page.

    On construction the status page is fetched (via the parent's
    get_mirrors()) and the caller's country name is resolved from the
    'country' key of self.my_info through iso3166.
    """
    mirrorlist_url = 'https://www.archlinux.org/mirrors/status/tier/1/'
    # Columns whose cell text starts with a number and is converted to float.
    _float_columns = ('Completion %', 'Mirror Score', 'μ Duration (s)', 'σ Duration (s)')
    # Column holding the mean delay, rendered as hours:minutes.
    _delay_column = 'μ Delay (hh:mm)'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.get_mirrors()
        self.mycountry = iso3166.countries_by_alpha2[self.my_info['country']].name

    @staticmethod
    def _parse_delay(text):
        """Convert an 'hh:mm' string to a datetime.timedelta.

        Returns None when the text is not two colon-separated integers, so
        the caller can leave the original cell untouched.
        """
        hours, sep, minutes = text.strip().partition(':')
        if not sep:
            return(None)
        try:
            return(datetime.timedelta(hours = int(hours), minutes = int(minutes)))
        except ValueError:
            return(None)

    def extract_mirrors(self):
        """Parse the "successful_mirrors" table into self.raw_mirrors.

        Each data row becomes a dict keyed by column header (a blank header
        is named 'details'). Numeric columns are converted to float and the
        delay column to a datetime.timedelta; every other value is left as
        the parsed table cell.

        Raises:
            RuntimeError: if the page has no "successful_mirrors" table.
        """
        # Limit to only successful mirrors.
        mirrors = self.bs.find('table', {'id': 'successful_mirrors'})
        if mirrors is None:
            raise RuntimeError('Could not find the successful mirrors table')
        # Modified from https://stackoverflow.com/a/56835562/733214.
        header = mirrors.find('thead').find('tr')
        headers = [h.text if h.text != '' else 'details' for h in header.find_all('th')]
        for raw_row in mirrors.find_all('tr'):
            cells = raw_row.find_all('td')
            if not cells:
                # The header row only has <th> cells; there is nothing to record.
                continue
            row = dict(zip(headers, cells))
            for k, v in row.items():
                # Only values of existing keys are replaced, so mutating the
                # dict while iterating over it is safe.
                if k in self._float_columns:
                    row[k] = float(_strip_re.sub(r'\g<num>', v.text).strip())
                elif k == self._delay_column:
                    delay = self._parse_delay(v.text)
                    if delay is not None:
                        row[k] = delay
            self.raw_mirrors.append(row)
        return(None)


def main():
    """Build a Ranker, extract its mirror table and pretty-print the rows."""
    from pprint import pprint as pretty_print
    ranker = Ranker()
    ranker.extract_mirrors()
    pretty_print(ranker.raw_mirrors)
    return None


if __name__ == '__main__':
    main()

View File

View File

@ -0,0 +1,39 @@
import socket
import time
##
import requests
from bs4 import BeautifulSoup
##
import constants


class Ranker(object):
    """Base class for distro-specific mirror rankers.

    Subclasses set mirrorlist_url and override extract_mirrors(). On
    construction, information about the local host is fetched from
    constants.MYINFO_URL and stored in self.my_info.
    """
    mirrorlist_url = None  # This is replaced by subclasses
    # Seconds to wait for each HTTP request before giving up, so an
    # unresponsive server cannot hang the script forever.
    request_timeout = 30

    def __init__(self, parser = 'lxml', *args, **kwargs):
        """Initialise empty state, then fetch information about this host.

        parser is the parser name handed to BeautifulSoup in get_mirrors().
        """
        self.my_info = {}
        self.raw_html = None
        self.parser = parser
        self.bs = None
        # Set before any network call so the attribute always exists.
        self.raw_mirrors = []
        self.get_myinfo()

    def extract_mirrors(self):
        """Dummy hook; subclasses override this to populate self.raw_mirrors."""
        return(None)

    def get_myinfo(self):
        """Fetch constants.MYINFO_URL and store its decoded JSON in self.my_info.

        Raises:
            RuntimeError: if the response status is not OK.
        """
        req = requests.get(constants.MYINFO_URL, timeout = self.request_timeout)
        if not req.ok:
            raise RuntimeError('Could not contact information gatherer')
        self.my_info = req.json()
        return(None)

    def get_mirrors(self):
        """Fetch self.mirrorlist_url and parse it into self.bs.

        The raw UTF-8 decoded page is kept in self.raw_html.

        Raises:
            RuntimeError: if the response status is not OK.
        """
        req = requests.get(self.mirrorlist_url, timeout = self.request_timeout)
        if not req.ok:
            raise RuntimeError('Could not fetch the mirror list from {0}'.format(self.mirrorlist_url))
        self.raw_html = req.content.decode('utf-8')
        self.bs = BeautifulSoup(self.raw_html, self.parser)
        return(None)

View File

@ -0,0 +1 @@
# URL requested by classes.Ranker.get_myinfo(); the response body is decoded as
# JSON, and the Arch ranker reads its 'country' key.
# NOTE(review): presumably this endpoint returns JSON for the default
# python-requests client -- confirm.
MYINFO_URL = 'https://ipinfo.io'

View File

@ -0,0 +1,20 @@
#!/usr/bin/env python3

import requests
from bs4 import BeautifulSoup

import pprint

# Scratch script: fetch the Arch Linux tier 1 mirror status page and dump the
# "successful_mirrors" table as a list of {column header: cell text} dicts.

country = 'US'  # Not used below yet.
url = 'https://www.archlinux.org/mirrors/status/tier/1/'

# Bound the wait and fail loudly on an HTTP error instead of parsing an error page.
req = requests.get(url, timeout = 30)
req.raise_for_status()
html = req.content.decode('utf-8')
bs = BeautifulSoup(html, 'lxml')

mirrors = bs.find('table', {'id': 'successful_mirrors'})
header = mirrors.find('thead').find('tr')
# A column with a blank header is named 'details'.
headers = [h.text if h.text != '' else 'details' for h in header.find_all('th')]

results = []
for row in mirrors.find_all('tr'):
    cells = row.find_all('td')
    if not cells:
        # The header row has only <th> cells and would yield an empty dict.
        continue
    results.append({headers[i]: cell.text for i, cell in enumerate(cells)})

pprint.pprint(results)