author    brent s <r00t@square-r00t.net>    2020-07-14 17:13:14 -0400
committer    brent s <r00t@square-r00t.net>    2020-07-14 17:13:14 -0400
commit    c4bb612381315516d3dd14bb6a9cb34ed9f0d09d (patch)
tree    e30ecc0ccac15182a4bafc44b82e735e0a5a4629
parent    289c2711b8aba2a455d69de0b2ac79970fab89b5 (diff)
download    OpTools-c4bb612381315516d3dd14bb6a9cb34ed9f0d09d.tar.xz
adding get_title
-rwxr-xr-x    net/get_title.py    78
1 files changed, 78 insertions, 0 deletions
diff --git a/net/get_title.py b/net/get_title.py
new file mode 100755
index 0000000..dcdebf2
--- /dev/null
+++ b/net/get_title.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+
+import argparse
+try:
+    import requests as handler
+    has_req = True
+except ImportError:
+    from urllib.request import urlopen as handler
+    has_req = False
+try:
+    import lxml
+    parser = 'lxml'
+except ImportError:
+    parser = 'html.parser'
+from bs4 import BeautifulSoup
+
+
+def_elem = 'title'
+
+
+class InfoScraper(object):
+    def __init__(self, url, elem = def_elem, *args, **kwargs):
+        self.url = url
+        self.elem = elem
+        self.raw = None
+        self.str = None
+        self.soup = None
+        self._get_page()
+
+    def _get_page(self):
+        if has_req:
+            self.raw = handler.get(self.url).content
+        else:
+            with handler(self.url) as fh:
+                self.raw = fh.read()
+        try:
+            self.str = self.raw.decode('utf-8')
+        except Exception:
+            pass
+        self.soup = BeautifulSoup(self.str if self.str is not None else self.raw, features = parser)  # fall back to raw bytes if decoding failed
+        return(None)
+
+    def find(self):
+        rtrn = [e for e in self.soup.find_all(self.elem)]
+        if len(rtrn) == 1:
+            rtrn = rtrn[0]
+        elif len(rtrn) == 0:
+            rtrn = None
+        return(rtrn)
+
+
+
+def parseArgs():
+    args = argparse.ArgumentParser(description = 'Get quick information from a URL at a glance')
+    args.add_argument('-e', '--elem',
+                      default = def_elem,
+                      help = ('The element(s) you want to scrape from the page. This is likely just going to be "{0}" (the default)').format(def_elem))
+    args.add_argument('url',
+                      metavar = 'URL',
+                      help = ('The URL to parse. It may need to be quoted or escaped depending on the URL and what shell you\'re using'))
+    return(args)
+
+
+def main():
+    args = parseArgs().parse_args()
+    i = InfoScraper(**vars(args))
+    rslts = i.find()
+    if isinstance(rslts, list):
+        for tag in rslts:
+            print(tag.text)
+    elif rslts is not None:  # find() returns None when nothing matched
+        print(rslts.text)
+    return(None)
+
+
+if __name__ == '__main__':
+    main()
+
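
Usage sketch (not part of the commit above; the module name below is an assumption for illustration): from a shell the script would be run as ./net/get_title.py 'https://example.com/page?a=1&b=2', quoting the URL so the shell does not interpret characters like &. The class can also be used from Python directly, assuming get_title.py is reachable on sys.path:

    #!/usr/bin/env python3
    # Hypothetical programmatic use of InfoScraper from net/get_title.py,
    # assuming the file is importable as the module "get_title".
    from get_title import InfoScraper

    scraper = InfoScraper('https://example.com')   # elem defaults to 'title'
    result = scraper.find()                        # a single Tag, a list of Tags, or None
    if isinstance(result, list):
        for tag in result:
            print(tag.text)
    elif result is not None:
        print(result.text)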