go_goutils/stringsx/funcs.go

package stringsx

import (
	`bytes`
	`errors`
	`fmt`
	`io`
	`slices`
	`strings`
	`unicode`
)

/*
IsAscii reports whether every character of string s is acceptable ASCII.

allowCtl and allowExt are forwarded unchanged; printable characters and the
whitespace control characters are always allowed, and no additional inclusions
or exclusions are applied.

It is a convenience wrapper that is equivalent to:

	isAscii, err = IsAsciiSpecial(s, allowCtl, true, allowExt, true, nil, nil)

See [IsAsciiSpecial] for the meaning of each flag and for the returned error.
*/
func IsAscii(s string, allowCtl, allowExt bool) (isAscii bool, err error) {
	return IsAsciiSpecial(s, allowCtl, true, allowExt, true, nil, nil)
}

/*
IsAsciiBuf reports whether everything read from r is acceptable ASCII.

The reader is consumed by this call.

allowCtl and allowExt are forwarded unchanged; printable characters and the
whitespace control characters are always allowed, and no additional inclusions
or exclusions are applied.

It is a convenience wrapper that is equivalent to:

	isAscii, err = IsAsciiBufSpecial(r, allowCtl, true, allowExt, true, nil, nil)

See [IsAsciiBufSpecial] for the meaning of each flag and for the returned error.
*/
func IsAsciiBuf(r io.RuneReader, allowCtl, allowExt bool) (isAscii bool, err error) {
	return IsAsciiBufSpecial(r, allowCtl, true, allowExt, true, nil, nil)
}

/*
IsAsciiSpecial validates string s against a configurable set of ASCII ranges.

allowCtl, if true, allows control characters (0x00 to 0x1f inclusive).

allowPrint, if true, allows printable characters - what most people mean
when they say "ASCII" (0x20 to 0x7f inclusive).

allowExt, if true, allows "extended ASCII"; some later dialects expand
to a full 8-bit range (0x80 to 0xff inclusive).

allowWs, if true, allows the "whitespace control characters" (\t, \n, \r)
independently of allowCtl. Thus:

	IsAsciiSpecial(s, false, true, false, true, nil, nil)

has the same effect as (whatever value is passed for allowWs):

	IsAsciiSpecial(s, false, true, false, (-), []byte("\t\n\r"), nil)

incl, if non-empty, lists *additional* bytes to accept that would
otherwise be rejected.

excl, if non-empty, lists bytes to reject that would otherwise be accepted.

excl takes precedence over incl when a byte appears in both.

s is wrapped in a [bytes.Buffer] and handed to [IsAsciiBufSpecial]; an
[AsciiInvalidError] is returned for the first invalid character encountered.
*/
func IsAsciiSpecial(s string, allowCtl, allowPrint, allowExt, allowWs bool, incl, excl []byte) (isAscii bool, err error) {
	return IsAsciiBufSpecial(bytes.NewBufferString(s), allowCtl, allowPrint, allowExt, allowWs, incl, excl)
}

/*
IsAsciiBufSpecial is the same as [IsAsciiSpecial] but operates on an [io.RuneReader].

Note that the reader will be consumed/read by this function.

It will not return an [io.EOF] if encountered (that is treated as successful
exhaustion of r, and isAscii will be true), but any other error returned by r
is passed through as-is with isAscii left false.
It is expected that r will return an [io.EOF] when exhausted.

An [AsciiInvalidError] will be returned on the first encountered invalid character.
Its position fields are maintained as follows:

  - Line is incremented for every rune that directly follows a '\n'.
  - Char counts every rune read so far, *including* the offending one.
  - Byte and LineByte only count the bytes of runes that were accepted, i.e.
    they are the byte offset at which the offending rune starts.
  - BadChar is the offending rune; BadBytes is its UTF-8 encoding, except for
    a disallowed rune in the 0x00-0xff range, where it is the single byte value.
*/
func IsAsciiBufSpecial(r io.RuneReader, allowCtl, allowPrint, allowExt, allowWs bool, incl, excl []byte) (isAscii bool, err error) {

	var b rune
	var bLen int
	var nextNewline bool
	var tmpErr *AsciiInvalidError = new(AsciiInvalidError)
	// Precomputed 256-entry lookup table; keeps the per-rune check to a single index.
	var allowed [256]bool = getAsciiCharMap(allowCtl, allowPrint, allowExt, allowWs, incl, excl)

	// The loop only ever exits via one of the returns below.
	for {
		if b, bLen, err = r.ReadRune(); err != nil {
			if errors.Is(err, io.EOF) {
				// Clean end of input: everything read so far was valid.
				err = nil
				isAscii = true
			}
			return
		}

		// Character/line counters are advanced *before* validation.
		if nextNewline {
			tmpErr.Line++
			tmpErr.LineByte = 0
			// NOTE(review): the first rune of every line after the first is reported
			// with LineChar == 0, while the first rune of the very first line is
			// reported with LineChar == 1. Confirm the intended base against the
			// AsciiInvalidError documentation before changing this.
			tmpErr.LineChar = 0
		} else {
			tmpErr.LineChar++
		}
		tmpErr.Char++
		nextNewline = b == '\n'

		switch {
		case b == unicode.ReplacementChar, bLen > 2, b > 0xff:
			// Either not valid UTF-8 at all (ReadRune yields U+FFFD), or a rune that
			// cannot be ASCII: ASCII occupies a single byte, ISO-8859-1 at most two.
			tmpErr.BadChar = b
			tmpErr.BadBytes = []byte(string(b))
			err = tmpErr
			return
		case !allowed[byte(b)]:
			// Within 0x00-0xff, but not permitted by the requested ranges.
			tmpErr.BadChar = b
			tmpErr.BadBytes = []byte{byte(b)}
			err = tmpErr
			return
		}

		// Byte counters are advanced only *after* the rune was accepted.
		tmpErr.LineByte += uint64(bLen)
		tmpErr.Byte += uint64(bLen)
	}
}

/*
LenSplit formats string `s` to break at, at most, every `width` characters.

Any existing newlines (e.g. \r\n) will be removed during a string/
substring/line's length calculation. (e.g. `foobarbaz\n` and `foobarbaz\r\n` are
both considered to be lines of length 9, not 10 and 11 respectively).

This also means that any newlines (\n or \r\n) are inherently removed from
`out` (even if included in `wordWrap`; see below).

Note that if `s` is multiline (already contains newlines), they will be respected
as-is - that is, if a line ends with less than `width` chars and then has a newline,
it will be preserved as an empty element. That is to say:

	"foo\nbar\n\n" → []string{"foo", "bar", ""}
	"foo\n\nbar\n" → []string{"foo", "", "bar"}

This splitter is particularly simple. If you need wordwrapping, it should be done
with e.g. [github.com/muesli/reflow/wordwrap].
*/
func LenSplit(s string, width uint) (out []string) {

	var end int
	var line string
	var lineRunes []rune

	if width == 0 {
		out = []string{s}
		return
	}

	for line = range strings.Lines(s) {
		line = strings.TrimRight(line, "\n")
		line = strings.TrimRight(line, "\r")

		lineRunes = []rune(line)

		if uint(len(lineRunes)) <= width {
			out = append(out, line)
			continue
		}

		for i := 0; i < len(lineRunes); i += int(width) {
			end = i + int(width)
			if end > len(lineRunes) {
				end = len(lineRunes)
			}
			out = append(out, string(lineRunes[i:end]))
		}
	}

	return
}

/*
LenSplitStr runs [LenSplit] on `s` and joins the resulting pieces back
into a single string, one piece per line.

It's mostly just a convenience wrapper.

`s` and `width` have the same meaning as in [LenSplit]. The additional
`winNewline` selects the separator used for joining: \r\n if true, \n otherwise.
No trailing separator is appended after the last piece.
*/
func LenSplitStr(s string, width uint, winNewline bool) (out string) {

	pieces := LenSplit(s, width)

	sep := "\n"
	if winNewline {
		sep = "\r\n"
	}

	return strings.Join(pieces, sep)
}

/*
Pad pads each element in `s` to length `width` using `pad`.
If `pad` is empty, a single space (0x20) will be assumed.
Note that `width` operates on rune size, not byte size.
(In ASCII, they will be the same size.)

If `width` is 0, `s` itself is returned unmodified.
Otherwise `out` is a new slice with the same length and element order as `s`.

If an element in `s` is greater than or equal to `width`,
no padding will be performed on it.

`pad` may consist of more than one character. It is repeated as often as
necessary and, if the gap to fill is not a multiple of its length, cut off
after the required number of runes (e.g. padding "ab" to a width of 5 with
"xy" gives "abxyx").

If `leftPad` is true, padding will be applied to the "left" ("beginning")
of each element instead of the "right" ("end").
*/
func Pad(s []string, width uint, pad string, leftPad bool) (out []string) {

	var idx int
	var elem string
	var fill string
	var need uint
	var padLen uint
	var elemLen uint
	var padRunes []rune

	if width == 0 {
		out = s
		return
	}

	out = make([]string, len(s))

	// Easy; space padding is supported directly in fmt (which measures width in runes).
	if pad == "" {
		for idx, elem = range s {
			if leftPad {
				out[idx] = fmt.Sprintf("%*s", width, elem)
			} else {
				out[idx] = fmt.Sprintf("%-*s", width, elem)
			}
		}
		return
	}

	// Custom pad: work in runes so multi-byte characters count as one column.
	padRunes = []rune(pad)
	padLen = uint(len(padRunes))
	for idx, elem = range s {
		elemLen = uint(len([]rune(elem)))
		// Already wide enough; keep as-is.
		if elemLen >= width {
			out[idx] = elem
			continue
		}

		// As many whole repetitions of pad as fit, then the leading runes of
		// one more repetition to fill whatever remains.
		need = width - elemLen
		fill = strings.Repeat(pad, int(need/padLen)) + string(padRunes[:need%padLen])

		if leftPad {
			out[idx] = fill + elem
		} else {
			out[idx] = elem + fill
		}
	}

	return
}

/*
Redact provides a "masked" version of string s (e.g. `my_terrible_password` -> `my****************rd`).

maskStr is the character or sequence of characters
to repeat for every masked character of s.
If an empty string, the default [DefMaskStr] will be used.
(maskStr does not need to be a single character.
It is recommended to use a multi-char mask to help obfuscate a string's length.)

leading specifies the number of leading characters of s to leave *unmasked*.
If 0, no leading characters will be unmasked.

trailing specifies the number of trailing characters of s to leave *unmasked*.
If 0, no trailing characters will be unmasked.

newlines, if true, will preserve newline characters (\n or \r\n) and apply
leading/trailing to each line individually - otherwise they will be treated
as regular characters.

"Characters" are runes, not bytes: a multi-byte UTF-8 character is masked
(or left unmasked) as a whole and is replaced by a single maskStr.
For pure ASCII input, runes and bytes are the same thing.

As a safety precaution, if the number of characters in s (or, with newlines,
in a line) is less than or equal to

	leading + trailing

then the entire string (or line) will be *masked* and no unmasking will be performed.

An empty s is returned as an empty string.

Note that this DOES NOT do a string *replace*, it provides a masked version of `s` itself.
Wrap Redact with [strings.ReplaceAll] if you want to replace a certain value with a masked one.
*/
func Redact(s, maskStr string, leading, trailing uint, newlines bool) (redacted string) {

	var nl string
	var total uint
	var runes []rune
	var sb strings.Builder

	// Nothing to mask.
	if s == "" {
		return
	}

	if maskStr == "" {
		maskStr = DefMaskStr
	}

	if newlines {
		// Mask every line on its own and re-attach its original terminator.
		for line := range strings.Lines(s) {
			nl = getNewLine(line)
			sb.WriteString(
				Redact(
					strings.TrimSuffix(line, nl), maskStr, leading, trailing, false,
				),
			)
			sb.WriteString(nl)
		}
		redacted = sb.String()
		return
	}

	// Work on runes so that multi-byte characters are never cut in half.
	runes = []rune(s)
	total = uint(len(runes))

	// Too short to safely reveal anything. Each bound is checked on its own
	// first so that leading+trailing cannot wrap around for huge values.
	if leading >= total || trailing >= total || leading+trailing >= total {
		redacted = strings.Repeat(maskStr, len(runes))
		return
	}

	sb.WriteString(string(runes[:leading]))
	sb.WriteString(strings.Repeat(maskStr, int(total-leading-trailing)))
	sb.WriteString(string(runes[total-trailing:]))

	redacted = sb.String()

	return
}

// Reverse returns s with the order of its runes (not its bytes) reversed,
// so multi-byte UTF-8 characters stay intact.
// (It's absolutely insane that this isn't in stdlib.)
func Reverse(s string) (revS string) {
	runes := []rune(s)
	slices.Reverse(runes)
	return string(runes)
}

/*
TrimLines is like [strings.TrimSpace] but operates on *each line* of s.
It is *NIX-newline (`\n`) vs. Windows-newline (`\r\n`) agnostic.
The first encountered linebreak (`\n` vs. `\r\n`) are assumed to be
the canonical linebreak for the rest of s.

left, if true, performs a [TrimSpaceLeft] on each line (retaining the newline).

right, if true, performs a [TrimSpaceRight] on each line (retaining the newline).
*/
func TrimLines(s string, left, right bool) (trimmed string) {

	var sl string
	var nl string
	var sb strings.Builder

	// These conditions functionally won't do anything, so just return the input as-is.
	if s == "" {
		return
	}
	if !left && !right {
		trimmed = s
		return
	}

	for line := range strings.Lines(s) {
		nl = getNewLine(line)
		sl = strings.TrimSuffix(line, nl)
		if left && right {
			sl = strings.TrimSpace(sl)
		} else if left {
			sl = TrimSpaceLeft(sl)
		} else if right {
			sl = TrimSpaceRight(sl)
		}
		sb.WriteString(sl + nl)
	}

	trimmed = sb.String()

	return
}

// TrimSpaceLeft strips leading whitespace (as defined by [unicode.IsSpace]) from string `s`.
// It is the left-hand half of [strings.TrimSpace]; trailing whitespace is left untouched.
func TrimSpaceLeft(s string) (trimmed string) {
	return strings.TrimLeftFunc(s, unicode.IsSpace)
}

/*
TrimSpaceRight strips trailing whitespace (as defined by [unicode.IsSpace]) from string s.
It is the right-hand half of [strings.TrimSpace]; leading whitespace is left untouched.
*/
func TrimSpaceRight(s string) (trimmed string) {
	return strings.TrimRightFunc(s, unicode.IsSpace)
}

/*
getAsciiCharMap returns a lookup "table" for ASCII characters: charmap[b] is
true if, and only if, byte value b is to be considered valid.

The table is built in this order (later steps override earlier ones):

 1. allowCtl decides 0x00 to 0x1f inclusive (control characters).
 2. allowPrint decides 0x20 to 0x7f inclusive (printable characters).
 3. allowExt decides 0x80 to 0xff inclusive ("extended ASCII").
 4. allowWs, if true, allows \t, \n and \r regardless of allowCtl.
 5. Every byte in incl is allowed.
 6. Every byte in excl is disallowed (so excl wins over incl).

incl and excl may be nil or empty.
*/
func getAsciiCharMap(allowCtl, allowPrint, allowExt, allowWs bool, incl, excl []byte) (charmap [256]bool) {

	var idx int
	var b byte

	// Classify every byte value by the range it falls into.
	for idx = range charmap {
		switch {
		case idx <= 0x1f:
			charmap[idx] = allowCtl
		case idx <= 0x7f:
			charmap[idx] = allowPrint
		default:
			charmap[idx] = allowExt
		}
	}

	if allowWs {
		charmap['\t'] = true
		charmap['\n'] = true
		charmap['\r'] = true
	}

	// Ranging over a nil/empty slice is a no-op, so no guards are needed.
	for _, b = range incl {
		charmap[b] = true
	}
	// Exclusions go last so that they take precedence over everything else.
	for _, b = range excl {
		charmap[b] = false
	}

	return
}

// getNewLine returns the line terminator that s ends with: "\r\n", "\n",
// or an empty string if s ends with neither (a lone trailing "\r" does not count).
// It is too unpredictable/nuanced to be used as part of a public API promise so it isn't exported.
func getNewLine(s string) (nl string) {
	switch {
	case strings.HasSuffix(s, "\r\n"):
		// Must be tested first, as "\r\n" also ends in "\n".
		return "\r\n"
	case strings.HasSuffix(s, "\n"):
		return "\n"
	default:
		return ""
	}
}