Checking in irssi log parser rewrite progress... got a TON done on it.
This commit is contained in:
parent
aac603b4ee
commit
9f1515b96c
@ -1,179 +1,275 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
## REFERENCE ##
|
||||
# https://github.com/myano/jenni/wiki/IRC-String-Formatting
|
||||
# https://www.mirc.com/colors.html
|
||||
# https://en.wikipedia.org/wiki/ANSI_escape_code
|
||||
# https://github.com/shabble/irssi-docs/wiki/Formats#Colourising-Text-in-IRC-Messages
|
||||
|
||||
|
||||
import argparse
|
||||
import curses
|
||||
import os
|
||||
import pprint
|
||||
import re
|
||||
import sys
|
||||
try:
|
||||
import magic
|
||||
has_magic = True
|
||||
except ImportError:
|
||||
print(('Warning: you do not have the magic module installed ' +
|
||||
'(you can install it via "pip3 install --user python-magic"). ' +
|
||||
'Automatic log decompression will not work.'))
|
||||
print('Warning: you do not have the magic module installed (you can '
|
||||
'install it via "pip3 install --user file-magic"). Automatic log '
|
||||
'decompression will not work.')
|
||||
has_magic = False
|
||||
|
||||
class logParser(object):
|
||||
# Maps the MIME type reported by libmagic to the name of the standard-library
# module used to decompress the log. None means "do not decompress".
cmprsn_map = {
    'text/plain': None,                # Plain ol' text
    # Sometimes the formatting with color gives this
    'application/octet-stream': None,
    'application/x-bzip2': 'bz2',      # Bzip2
    'application/x-gzip': 'gzip',      # Gzip (legacy MIME name)
    # Registered name (RFC 6713); reported by current libmagic releases.
    'application/gzip': 'gzip',
    'application/x-xz': 'lzma',        # XZ
}
|
||||
|
||||
# irssi/mIRC to ANSI
# Split into 3 maps (truecolor will be populated later, currently uses 8-bit):
# - 8 (3/4 bit color values, 8 colors)
# - 256 (8-bit, 256 colors)
# - 'truecolor' (24-bit, ISO-8613-3, 16777216 colors)
# Keys are the mIRC color value (a string, without a leading zero)
# Values are:
# - 8: tuple of the ANSI (fg, bg) SGR values
# - 256: single value (same number is used for fg and bg)
# - 'truecolor': tuple of (R#, G#, B#) (same tuple is used for fg and bg)
# In addition, all three have the following:
# - ansi_wrap: the str.format() templates.
#       fg: foreground color
#       bg: background color (if present)
#   They are concatted together in that order. Every template is a complete,
#   'm'-terminated escape sequence so that a foreground-only color is valid
#   on its own.
colormap = {
    ## https://en.wikipedia.org/wiki/ANSI_escape_code#3/4_bit
    8: {'0': ('97', '107'),
        '1': ('30', '40'),
        '2': ('34', '44'),
        '3': ('32', '42'),
        '4': ('91', '101'),
        '5': ('31', '41'),
        '6': ('35', '45'),
        '7': ('33', '43'),
        '8': ('93', '103'),
        '9': ('92', '102'),
        '10': ('36', '46'),
        '11': ('96', '106'),
        '12': ('94', '104'),
        '13': ('95', '105'),
        '14': ('90', '100'),
        '15': ('37', '47'),
        'ansi_wrap': {'fg': '\x1b[{0}m',
                      'bg': '\x1b[{0}m'}},
    ## https://en.wikipedia.org/wiki/ANSI_escape_code#8-bit
    256: {'0': '15',
          '1': '0',
          '2': '19',
          '3': '34',
          '4': '196',
          '5': '52',
          '6': '90',
          '7': '208',
          '8': '226',
          '9': '82',
          '10': '37',
          '11': '51',
          '12': '21',
          '13': '199',
          '14': '241',
          '15': '252',
          'ansi_wrap': {'fg': '\x1b[38;5;{0}m',
                        'bg': '\x1b[48;5;{0}m'}},
    ## https://en.wikipedia.org/wiki/ANSI_escape_code#24-bit
    # (can just use mIRC's R,G,B)
    'truecolor': {'0': ('255', '255', '255'),
                  '1': ('0', '0', '0'),
                  '2': ('0', '0', '127'),
                  '3': ('0', '147', '0'),
                  '4': ('255', '0', '0'),
                  '5': ('127', '0', '0'),
                  '6': ('156', '0', '156'),
                  '7': ('252', '127', '0'),
                  '8': ('255', '255', '0'),
                  '9': ('0', '252', '0'),
                  '10': ('0', '147', '147'),
                  '11': ('0', '255', '255'),
                  '12': ('0', '0', '252'),
                  '13': ('255', '0', '255'),
                  '14': ('127', '127', '127'),
                  '15': ('210', '210', '210'),
                  # SGR parameters are ';'-separated: 38;2;R;G;B / 48;2;R;G;B
                  'ansi_wrap': {'fg': '\x1b[38;2;{0[0]};{0[1]};{0[2]}m',
                                'bg': '\x1b[48;2;{0[0]};{0[1]};{0[2]}m'}},
}
|
||||
|
||||
def get_palette():
    """Return the palette to use for ANSI output: 8 or 256.

    A terminal that advertises COLORTERM=truecolor is currently given the
    256-color palette. Otherwise curses is asked how many colors the terminal
    supports and the answer is reduced to one of the two palette sizes.
    If curses cannot initialise the terminal, 8 is returned.
    """
    if os.getenv('COLORTERM', None) == 'truecolor':
        # TODO: 24-bit support (16777216 colors) instead of 8-bit.
        return 256
    try:
        curses.initscr()
        try:
            curses.start_color()
            available = curses.COLORS
        finally:
            # Always leave curses mode, even if start_color() failed,
            # so the terminal is not left in an unusable state.
            curses.endwin()
    except curses.error:
        return 8
    # Terminals may report other sizes (e.g. 16 or 88); only return a size
    # that this function is documented to produce.
    return 256 if available >= 256 else 8
|
||||
|
||||
def color_converter(data_in, palette_map, palettes=None):
    """Convert mIRC/irssi color codes in a log to ANSI escape sequences.

    Only used when color output was requested.

    data_in:     the decoded log text (str).
    palette_map: the key of the palette to use (8, 256 or 'truecolor').
    palettes:    optional palette table with the same layout as the
                 module-level ``colormap``; defaults to ``colormap``.

    Returns the converted text with lines joined by '\\n' (a trailing newline
    is not preserved). Raises KeyError if palette_map is not in the table.
    """
    if palettes is None:
        palettes = colormap
    _colors = palettes[palette_map]
    templates = _colors['ansi_wrap']
    # the value to reset formatting to the terminal default
    reset_char = '\x1b[0m'
    # the value to reset the foreground text
    def_fg = '\x1b[39m'
    # the value to reset the background text
    def_bg = '\x1b[49m'
    # ^C followed by a foreground number and an optional ",background".
    color_ptrn = re.compile(r'\x03([0-9]{1,2})(?:,([0-9]{1,2}))?')
    nick_ptrn = re.compile(r'\x04[89]/')
    # NOTE(review): the exact meaning of irssi's \x04-prefixed codes is not
    # confirmed here; every one of them is turned into a full reset.
    reset_ptrn = re.compile(r'\x04(g|c|[389;]/?|e|>)?/?')
    clear_ptrn = re.compile(r'\x0f(g|c|[389;]/?|e)?/?')
    # NOTE: bold (\x02) is not converted; it passes through unchanged.

    def _sgr(code, layer):
        # Build the escape sequence for one mIRC color number.
        # layer is 'fg' or 'bg'.
        value = _colors.get(str(int(code)))
        if value is None:
            # Not one of the 16 mapped colors (e.g. 99): use the default.
            return def_fg if layer == 'fg' else def_bg
        if palette_map == 8:
            # The 3/4-bit palette stores a (fg, bg) pair per color.
            value = value[0] if layer == 'fg' else value[1]
        seq = templates[layer].format(value)
        # Templates may be written as two halves of a single sequence;
        # make each piece a complete, terminated sequence of its own.
        if not seq.startswith('\x1b['):
            seq = '\x1b[' + seq.lstrip(';')
        if not seq.endswith('m'):
            seq += 'm'
        return seq

    def _repl(ch_in):
        # Generate the replacement for one matched color code.
        fg, bg = ch_in.groups()
        ch_out = _sgr(fg, 'fg')
        if bg is not None:
            ch_out += _sgr(bg, 'bg')
        return ch_out

    # Split on real line endings only. str.splitlines() would also split on
    # \x1c-\x1e, and \x1d is the IRC italics code.
    data = re.split(r'\r\n|\r|\n', data_in)
    if data and data[-1] == '':
        data.pop()
    converted = []
    for line in data:
        # Get some preliminary replacements out of the way.
        line = nick_ptrn.sub(' ', line, 1)
        line = reset_ptrn.sub(reset_char, line)
        line = clear_ptrn.sub(def_fg + def_bg, line)
        # First set is text color; second set, if present, is bg color.
        line = color_ptrn.sub(_repl, line)
        # A ^C without numbers ends the current colors.
        line = line.replace('\x03', def_fg + def_bg)
        converted.append(line)
    return '\n'.join(converted)
|
||||
|
||||
def plain_stripper(data_in):
    """Strip irssi/mIRC formatting from a log, leaving plain text only.

    data_in: the decoded log text (str).

    Returns the cleaned text with lines joined by '\\n' (a trailing newline
    is not preserved).
    """
    # This cleans the nick field
    nick_ptrn = re.compile(r'\x04[89]/')
    # And these clean the actual chat messages: irssi's \x04 codes first,
    # then mIRC color codes and any remaining control characters.
    strip_ptrns = (
        re.compile(r'\x04(g|c|[389;]/?|e|>)/?'),
        re.compile(r'((\x03)\d\d?,\d\d?|(\x03)\d\d?|[\x01-\x1F])'),
    )
    # Split on real line endings only. str.splitlines() would also split on
    # \x1c-\x1e, and \x1d is the IRC italics code.
    lines = re.split(r'\r\n|\r|\n', data_in)
    if lines and lines[-1] == '':
        lines.pop()
    cleaned = []
    for line in lines:
        line = nick_ptrn.sub(' ', line, 1)
        for ptrn in strip_ptrns:
            line = ptrn.sub('', line)
        cleaned.append(line)
    return '\n'.join(cleaned)
|
||||
|
||||
class irssiLogParser(object):
|
||||
def __init__(self, args, data = None):
|
||||
# We'll need these accessible across the entire class.
|
||||
self.args = args
|
||||
# If specified, self.data takes precedence over self.args['logfile']
|
||||
# (if it was specified).
|
||||
self.data = data
|
||||
self.raw = data
|
||||
self.has_html = False
|
||||
# This is a map to determine which module to use to decompress,
|
||||
# if we should.
|
||||
self.cmprsn_map = {'text/plain': None, # Plain ol' text
|
||||
'application/octet-stream': None, # Sometimes the formatting with color gives this
|
||||
'application/x-bzip2': 'bz2', # Bzip2
|
||||
'application/x-gzip': 'gzip', # Gzip
|
||||
'application/x-xz': 'lzma'} # XZ
|
||||
# I thought y'all liked GUIs.
|
||||
# ANSI, which is interpreted by the shell.
|
||||
# Only used if args['color'] = True
|
||||
self.ansi_prefix = '\033['
|
||||
# irssi to ANSI
|
||||
self.colormap = {'00': '1;37m', # White
|
||||
'01': '0;30m', # Black
|
||||
'02': '0;34m', # Blue
|
||||
'03': '0;32m', # Green
|
||||
'04': '1;31m', # Light Red
|
||||
'05': '0;31m', # Red
|
||||
'06': '0;35m', # Magenta (translated as Purple)
|
||||
'07': '0;33m', # Orange (translated as Brown)
|
||||
'08': '1;33m', # Yellow
|
||||
'09': '1:32m', # Light Green
|
||||
'10': '0;36m', # Cyan
|
||||
'11': '1;36m', # Light Cyan
|
||||
'12': '1;34m', # Light Blue
|
||||
'13': '1;35m', # Light Magenta (translated as Light Purple)
|
||||
'14': '0;37m', # Gray
|
||||
'15': '1;37'} # Light Gray (translated as White)
|
||||
# The full, interpreted path.
|
||||
if 'logfile' in self.args.keys():
|
||||
self.args['logfile'] = os.path.abspath(os.path.expanduser(self.args['logfile']))
|
||||
if not self.data:
|
||||
self.getLog()
|
||||
else:
|
||||
self.data = self.data.decode('utf-8').splitlines()
|
||||
self.decompress = None
|
||||
if has_magic:
|
||||
# Determine what decompressor to use, if we need to.
|
||||
_mime = magic.detect_from_content(self.data).mime_type
|
||||
self.decompress = self.cmprsn_map[_mime]
|
||||
if self.args['html']:
|
||||
try:
|
||||
import ansi2html
|
||||
self.has_html = True
|
||||
except ImportError:
|
||||
print(('Warning: you have selected HTML output but do not ' +
|
||||
'have the ansi2html module installed. Rendering HTML ' +
|
||||
'output is not possible.'))
|
||||
self.has_html = False
|
||||
if 'color' in self.args and self.args['color']:
|
||||
if not self.args['html']:
|
||||
# Ensure that we support color output.
|
||||
curses.initscr()
|
||||
self.args['color'] = curses.can_change_color()
|
||||
curses.endwin()
|
||||
if not self.args['color'] and not self.args['raw']:
|
||||
raise RuntimeError('You have specified ANSI colorized '
|
||||
'output but your terminal does not '
|
||||
'support it. Use -fc/--force-color '
|
||||
'to force.')
|
||||
elif not self.args['color'] and self.args['raw']:
|
||||
self.args['color'] = True # Force the output anyways.
|
||||
if self.args['color']:
|
||||
if not self.args['raw']:
|
||||
self.colors = get_palette()
|
||||
else:
|
||||
self.colors = 8 # Best play it safe for maximum compatibility.
|
||||
# The full, interpreted path.
|
||||
if ('logfile' in self.args.keys() and
|
||||
self.args['logfile'] is not None):
|
||||
self.args['logfile'] = os.path.abspath(
|
||||
os.path.expanduser(
|
||||
self.args['logfile']))
|
||||
if not self.data:
|
||||
self.getlog()
|
||||
else:
|
||||
self.has_html = False
|
||||
# Conform everything to bytes.
|
||||
if not isinstance(self.data, bytes):
|
||||
self.data = self.data.encode('utf-8')
|
||||
self.decompressor()
|
||||
self.parser()
|
||||
|
||||
def getLog(self):
    """Read the file named by self.args['logfile'] into self.data as bytes.

    Raises FileNotFoundError if the path is not an existing file.
    """
    if not os.path.isfile(self.args['logfile']):
        raise FileNotFoundError(
            '{0} does not exist.'.format(self.args['logfile']))
    with open(self.args['logfile'], 'rb') as f:
        self.data = f.read()
|
||||
def getlog(self):
    """Load the raw log bytes into self.data and self.raw.

    The log is read from self.args['logfile'] when a path was given,
    otherwise from stdin when something is piped in.

    Returns an empty tuple.
    Raises FileNotFoundError if the given path is not an existing file, and
    ValueError if no path was given and stdin is a terminal.
    """
    # The key is optional, so do not assume it is present.
    logfile = self.args.get('logfile')
    if logfile:
        # A filepath was specified
        if not os.path.isfile(logfile):
            raise FileNotFoundError('{0} does not exist'.format(logfile))
        with open(logfile, 'rb') as f:
            self.data = f.read()
    elif not sys.stdin.isatty():
        # Try to get it from stdin
        self.data = sys.stdin.buffer.read()
    else:
        raise ValueError('Either a path to a logfile must be '
                         'specified or you must pipe a log in from '
                         'stdin.')
    self.raw = self.data
    return ()
|
||||
|
||||
def parseLog(self):
|
||||
if self.decompress:
|
||||
import importlib
|
||||
self.decmp = importlib.import_module(self.decompress)
|
||||
self.data = self.decmp.decompress(self.data)
|
||||
if self.args['color']:
|
||||
_idx = 0
|
||||
_datalst = self.data.split(b'\n')
|
||||
for line in _datalst[:]: # not really "lines", per se, but...
|
||||
# First we strip out some basic formatting at the beginning
|
||||
# of lines. Status lines are \x049/, chat lines are \x048/.
|
||||
# \x04g seem to be formatting resets of sort.
|
||||
line = re.sub('\x04[89]/'.encode('utf-8'),
|
||||
''.encode('utf-8'),
|
||||
line)
|
||||
line = re.sub('\x04g'.encode('utf-8'),
|
||||
''.encode('utf-8'),
|
||||
line)
|
||||
# Formatting resets
|
||||
line = re.sub('\x04e'.encode('utf-8'),
|
||||
'\033[0m'.encode('utf-8'),
|
||||
line)
|
||||
# Then we substitute bolds in. This is trickier, because
|
||||
# bolds (\x04c) *alternate*. So does the other? bold, \x02.
|
||||
for b in ('\x04c'.encode('utf-8'), '\x02'.encode('utf-8')):
|
||||
_linelst = line.split(b)
|
||||
_bold = False
|
||||
_cnt = 0
|
||||
for i in _linelst[:]:
|
||||
if _bold:
|
||||
_linelst[_cnt] = re.sub('^'.encode('utf-8'),
|
||||
(self.ansi_prefix + '1m').encode('utf-8'),
|
||||
i)
|
||||
else:
|
||||
_linelst[_cnt] = re.sub('^'.encode('utf-8'),
|
||||
(self.ansi_prefix + '0m').encode('utf-8'),
|
||||
i)
|
||||
_cnt += 1
|
||||
_bold = not _bold
|
||||
line = b''.join(_linelst)
|
||||
# Then we handle colors.
|
||||
_cnt = 0
|
||||
_linelst = line.split(b'\x03')
|
||||
for i in _linelst[:]:
|
||||
_color_idx = re.sub('^([0-9]{2}).*$'.encode('utf-8'),
|
||||
'\g<1>',
|
||||
i,
|
||||
re.MULTILINE).decode('utf-8')
|
||||
if _color_idx in self.colormap.keys():
|
||||
_linelst[_cnt] = re.sub('^[0-9]{2}'.encode('utf-8'),
|
||||
(self.ansi_prefix + self.colormap[_color_idx]).encode('utf-8'),
|
||||
i)
|
||||
_cnt += 1
|
||||
line = b''.join(_linelst)
|
||||
# Lastly, we fix join/part and other messages.
|
||||
_cnt = 0
|
||||
_linelst = line.split(b'\x04;/')
|
||||
for i in _linelst[:]:
|
||||
_templine = re.sub('^'.encode('utf-8'),
|
||||
''.encode('utf-8'),
|
||||
i,
|
||||
re.MULTILINE)
|
||||
_templine = re.sub('-!-'.encode('utf-8'),
|
||||
'\033[2m-!-'.encode('utf-8'),
|
||||
_templine)
|
||||
_linelst[_cnt] = re.sub('\x043/'.encode('utf-8'),
|
||||
''.encode('utf-8'),
|
||||
_templine)
|
||||
_cnt += 1
|
||||
line = re.sub(b'^\x1b\[0;32m\x1b\[0m\x1b\[0m', b'\033[0m', b''.join(_linelst))
|
||||
# Lastly we strip out \x04>/
|
||||
line = re.sub(b'\x04>/', b'', line)
|
||||
###
|
||||
_datalst[_idx] = line
|
||||
_idx += 1
|
||||
###
|
||||
self.data = b'\n'.join(_datalst)
|
||||
if self.args['html']:
|
||||
try:
|
||||
import ansi2html
|
||||
_has_html = True
|
||||
except ImportError:
|
||||
print(('Warning: you have selected HTML output but do not ' +
|
||||
'have the ansi2html module installed. Rendering HTML ' +
|
||||
'output is not possible.'))
|
||||
_has_html = False
|
||||
else:
|
||||
_has_html = False
|
||||
if _has_html:
|
||||
# This... basically sucks. It currently doesn't properly interpret the ANSI.
|
||||
_html = ansi2html.Ansi2HTMLConverter()
|
||||
self.data = _html.convert(self.data.decode('utf-8'))
|
||||
else: # We want plaintext, so strip ALL formatting.
|
||||
_stripbytes = ['\x04>/', '\x02', '\x043/', '\x048/', '\x049/', '\x04g', '\x04e', '\x04c', '\x04;/']
|
||||
for b in _stripbytes:
|
||||
self.data = re.sub(b.encode('utf-8'), ''.encode('utf-8'), self.data)
|
||||
self.data = re.sub('\\x03[0-9]{2}'.encode('utf-8'), ''.encode('utf-8'), self.data)
|
||||
def decompressor(self):
    """Decompress (if needed) and decode self.data into text.

    When the magic module is available, the content's MIME type selects the
    decompression module via cmprsn_map; unknown types are treated as
    uncompressed. The resulting bytes are decoded as UTF-8 and stored in both
    self.raw and self.data.

    Returns an empty tuple.
    """
    # TODO: use mime module as fallback?
    # https://docs.python.org/3/library/mimetypes.html
    # VERY less-than-ideal since it won't work without self.args['logfile']
    # (and has iffy detection at best, since it relies on file extensions).
    payload = self.data
    self.decompress = None
    # Determine what decompressor to use, if we need to.
    if has_magic:
        _mime = magic.detect_from_content(payload).mime_type
        self.decompress = cmprsn_map.get(_mime)
        if self.decompress:
            import importlib
            decmp = importlib.import_module(self.decompress)
            payload = decmp.decompress(payload)
    # Decode what was actually decompressed, not the original input.
    try:
        self.raw = payload.decode('utf-8')
    except UnicodeDecodeError:
        # Keep going with replacement characters so that the parsing step
        # always receives text rather than bytes.
        self.raw = payload.decode('utf-8', errors='replace')
    self.data = self.raw
    return ()
|
||||
|
||||
def parser(self):
    """Turn the decoded log in self.data into its final output form.

    With color enabled the mIRC/irssi codes are converted to ANSI using the
    palette in self.colors; otherwise all formatting is stripped.

    Returns an empty tuple.
    """
    if self.args.get('color'):
        self.data = color_converter(self.data, self.colors)
        # Just in case... make sure no color leaks past the end of the log.
        # Plain-text output must not receive an escape sequence.
        self.data += '\x1b[0m'
    else:
        self.data = plain_stripper(self.data)
    return ()
|
||||
|
||||
def parseArgs():
|
||||
@ -181,19 +277,41 @@ def parseArgs():
|
||||
args.add_argument('-c', '--color',
|
||||
dest = 'color',
|
||||
action = 'store_true',
|
||||
help = ('Print the log with converted colors.'))
|
||||
help = ('Print the log with converted colors (ANSI)'))
|
||||
args.add_argument('-r', '--raw',
|
||||
dest = 'raw',
|
||||
action = 'store_true',
|
||||
help = ('Use this switch if your terminal is detected '
|
||||
'as not supporting color output but wish to '
|
||||
'force it anyways. A string representation of '
|
||||
'the ANSI output will be produced instead ('
|
||||
'suitable for pasting elsewhere). Only used if '
|
||||
'-c/--color is enabled (ignored with '
|
||||
'-H/--html)'))
|
||||
args.add_argument('-H', '--html',
|
||||
dest = 'html',
|
||||
action = 'store_true',
|
||||
help = ('Render HTML output.'))
|
||||
help = ('Render HTML output'))
|
||||
args.add_argument(dest = 'logfile',
|
||||
default = None,
|
||||
nargs = '?',
|
||||
metavar = 'path/to/logfile',
|
||||
help = ('The path to the log file. It can be uncompressed ' +
|
||||
'or compressed with XZ/LZMA, Gzip, or Bzip2.'))
|
||||
'or compressed with XZ/LZMA, Gzip, or Bzip2. '
|
||||
'If not specified, read from stdin'))
|
||||
return(args)
|
||||
|
||||
if __name__ == '__main__':
    # Parse the command line, run the parser, and print the converted log.
    args = vars(parseArgs().parse_args())
    log = irssiLogParser(args)
    print(log.data)
|
||||
|
Loading…
Reference in New Issue
Block a user