gotosocial/vendor/github.com/k3a/html2text/html2text.go

package html2text

import (
	"bytes"
	"regexp"
	"strconv"
	"strings"
)

// Line break constants: WIN_LBR is the Windows-style sequence and the default
// line break of newOptions; UNIX_LBR is the Unix-style sequence selected by
// WithUnixLineBreaks or SetUnixLbr(true).
//
// Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreaks())
const (
	WIN_LBR  = "\r\n"
	UNIX_LBR = "\n"
)

// legacyLBR is the package-wide line break setting written by SetUnixLbr and
// read by HTML2Text. It is plain mutable package state; nothing in this file
// synchronizes access to it.
var legacyLBR = WIN_LBR
// Regular expressions used by the converter. All tag expressions are applied
// to the text found between '<' and '>' (without the brackets).
var (
	// badTagnamesRE matches opening tags whose content is not emitted as text:
	// <head>, <script>, <style> and <a>. It is applied to the lowercased tag.
	badTagnamesRE = regexp.MustCompile(`^(head|script|style|a)($|\s+)`)

	// linkTagRE matches an <a> tag carrying an href attribute. Submatch 2 holds
	// a single-quoted value, submatch 3 a double-quoted value and submatch 4 an
	// unquoted value; at most one of them is non-empty.
	linkTagRE = regexp.MustCompile(`^(?i:a)(?:$|\s).*(?i:href)\s*=\s*('([^']*?)'|"([^"]*?)"|([^\s"'` + "`" + `=<>]+))`)

	// badLinkHrefRE matches hrefs that must never be written to the output.
	// The match is case-insensitive because URI scheme names are
	// case-insensitive and the href is taken from the tag in its original
	// case, so "JavaScript:" must be rejected just like "javascript:".
	badLinkHrefRE = regexp.MustCompile(`(?i)javascript:`)

	// headersRE matches opening and closing header tags (h1-h6) in a
	// lowercased tag.
	headersRE = regexp.MustCompile(`^(\/)?h[1-6]`)

	// numericEntityRE matches the name part of a numeric character reference
	// (the text between '&' and ';'), e.g. "#169" or "#xA9". Submatch 1 holds
	// the digits including an optional leading x.
	numericEntityRE = regexp.MustCompile(`(?i)^#(x?[a-f0-9]+)$`)
)

// options holds the settings of a single conversion. It starts from the
// defaults of newOptions and is then modified by the Option values passed to
// HTML2TextWithOptions.
type options struct {
	lbr            string // line break sequence written to the output (WIN_LBR or UNIX_LBR)
	linksInnerText bool   // keep the text inside <a> and append the href as " <href>" after it
	listPrefix     string // text written after the line break emitted for each <li>; empty means none
}

// newOptions returns a fresh options value carrying the package defaults:
// Windows line breaks, link inner text disabled and no list prefix.
func newOptions() *options {
	defaults := new(options)
	defaults.lbr = WIN_LBR
	return defaults
}

// Option is a functional option: a function that adjusts the options of one
// HTML2TextWithOptions call. Options are applied in the order they are passed.
type Option func(*options)

// WithUnixLineBreaks returns an Option that makes the converter write Unix
// line breaks ("\n") in place of the default Windows ones ("\r\n").
func WithUnixLineBreaks() Option {
	useUnix := func(cfg *options) {
		cfg.lbr = UNIX_LBR
	}
	return useUnix
}

// WithLinksInnerText returns an Option that keeps the text inside link tags
// and appends the link's href, wrapped in angle brackets, after that text.
// Example: click news <http://bit.ly/2n4wXRs>
func WithLinksInnerText() Option {
	keepText := func(cfg *options) {
		cfg.linksInnerText = true
	}
	return keepText
}

// WithListSupportPrefix returns an Option that stores prefix as the list item
// prefix; the converter writes it right after the line break it emits for
// every <li> tag.
func WithListSupportPrefix(prefix string) Option {
	setPrefix := func(cfg *options) {
		cfg.listPrefix = prefix
	}
	return setPrefix
}

// WithListSupport is shorthand for WithListSupportPrefix with the prefix
// " - ", so every list item starts on its own line as " - item".
func WithListSupport() Option {
	const dashPrefix = " - "
	return WithListSupportPrefix(dashPrefix)
}

// parseHTMLEntity resolves an entity name (the text between '&' and ';') to
// the text it stands for. It reports false when the name is neither present in
// the entity table nor a usable numeric reference.
//
// Numeric references are written "#<decimal>" or "#x<hex>". Only tab (9),
// line feed (10), carriage return (13) and code points above 31 are accepted.
func parseHTMLEntity(entName string) (string, bool) {
	// Named entities take precedence over numeric references.
	if named, found := entity[entName]; found {
		return string(named), true
	}

	groups := numericEntityRE.FindStringSubmatch(entName)
	if len(groups) != 2 {
		return "", false
	}

	// A leading x/X switches the reference from decimal to hexadecimal.
	literal, base := groups[1], 10
	if literal != "" && (literal[0] == 'x' || literal[0] == 'X') {
		literal, base = literal[1:], 16
	}

	code, err := strconv.ParseInt(literal, base, 64)
	if err != nil {
		return "", false
	}

	switch {
	case code == 9, code == 10, code == 13, code > 31:
		return string(rune(code)), true
	default:
		// remaining control characters are rejected
		return "", false
	}
}

// SetUnixLbr with argument true sets Unix-style line-breaks in output ("\n")
// with argument false sets Windows-style line-breaks in output ("\r\n", the default)
// Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreak())
func SetUnixLbr(b bool) {
	if b {
		legacyLBR = UNIX_LBR
	} else {
		legacyLBR = WIN_LBR
	}
}

// HTMLEntitiesToText decodes HTML entities inside a provided
// string and returns decoded text
func HTMLEntitiesToText(htmlEntsText string) string {
	outBuf := bytes.NewBufferString("")
	inEnt := false

	for i, r := range htmlEntsText {
		switch {
		case r == ';' && inEnt:
			inEnt = false
			continue

		case r == '&': //possible html entity
			entName := ""
			isEnt := false

			// parse the entity name - max 10 chars
			chars := 0
			for _, er := range htmlEntsText[i+1:] {
				if er == ';' {
					isEnt = true
					break
				} else {
					entName += string(er)
				}

				chars++
				if chars == 10 {
					break
				}
			}

			if isEnt {
				if ent, isEnt := parseHTMLEntity(entName); isEnt {
					outBuf.WriteString(ent)
					inEnt = true
					continue
				}
			}
		}

		if !inEnt {
			outBuf.WriteRune(r)
		}
	}

	return outBuf.String()
}

// writeSpace appends a single space to outBuf unless the buffer is empty or
// already ends with a space, so consecutive calls never produce a run of
// spaces or a leading space.
func writeSpace(outBuf *bytes.Buffer) {
	size := outBuf.Len()
	if size == 0 {
		return
	}
	if outBuf.Bytes()[size-1] == ' ' {
		return
	}
	outBuf.WriteByte(' ')
}

// HTML2Text converts html into a text form
func HTML2Text(html string) string {
	var opts []Option
	if legacyLBR == UNIX_LBR {
		opts = append(opts, WithUnixLineBreaks())
	}
	return HTML2TextWithOptions(html, opts...)
}

// HTML2TextWithOptions converts html into plain text. reqOpts are applied in
// order on top of the defaults from newOptions.
//
// The input is scanned rune by rune in a single pass; no document tree is
// built. While scanning:
//   - whitespace and line break runes are replaced by at most one space
//     (see writeSpace);
//   - entities recognised by parseHTMLEntity are decoded;
//   - <br>, <li>, </ul> and </ol> write one line break; <p>, </p> and header
//     tags write two, but only when text was written since the last such break
//     and the tag does not end the document;
//   - text inside <head>, <script> and <style> is dropped;
//   - by default an <a> tag is replaced by its href; with WithLinksInnerText
//     the link text is kept and followed by " <href>". Hrefs matching
//     badLinkHrefRE are never written.
//
// Malformed markup is tolerated: a closing tag of a suppressed element that
// has no matching opening tag is ignored, and with WithLinksInnerText an <a>
// tag without a parsable href simply keeps its inner text.
func HTML2TextWithOptions(html string, reqOpts ...Option) string {
	opts := newOptions()
	for _, opt := range reqOpts {
		opt(opts)
	}

	inLen := len(html)
	tagStart := 0         // byte offset just after the most recent '<'
	inEnt := false        // skipping the raw characters of an already decoded entity
	badTagStackDepth := 0 // > 0 while inside a suppressed element, e.g. <head>...</head>
	shouldOutput := true  // false between '<' and the following '>'
	// hrefs of opened <a> tags waiting to be written after the link's inner
	// text (opts.linksInnerText only); consumed first in, first out
	var hrefs []string
	// new line cannot be printed at the beginning or
	// for <p> after a new line created by previous <p></p>
	canPrintNewline := false

	outBuf := bytes.NewBufferString("")

	for i, r := range html {
		// i is the byte offset of r, so this holds for a single-byte final
		// rune such as the '>' of a trailing tag.
		if i == inLen-1 {
			// prevent new line at the end of the document
			canPrintNewline = false
		}

		switch {
		// skip new lines and spaces adding a single space if not there yet
		case r <= 0xD, r == 0x85, r == 0x2028, r == 0x2029, // new lines
			r == ' ', r >= 0x2008 && r <= 0x200B: // spaces
			if shouldOutput && badTagStackDepth == 0 && !inEnt {
				writeSpace(outBuf)
			}
			continue

		case r == ';' && inEnt: // end of html entity
			inEnt = false
			continue

		case r == '&' && shouldOutput: // possible html entity
			entName := ""
			isEnt := false

			// parse the entity name - the ';' must follow within 10 chars
			chars := 0
			for _, er := range html[i+1:] {
				if er == ';' {
					isEnt = true
					break
				}
				entName += string(er)

				chars++
				if chars == 10 {
					break
				}
			}

			if isEnt {
				if ent, ok := parseHTMLEntity(entName); ok {
					// Entities inside a suppressed element belong to content
					// that is being dropped, so they must not be written.
					if badTagStackDepth == 0 {
						outBuf.WriteString(ent)
					}
					inEnt = true
					continue
				}
			}
			// not an entity: the '&' is handled as ordinary text below

		case r == '<': // start of a tag
			tagStart = i + 1
			shouldOutput = false
			continue

		case r == '>': // end of a tag
			// NOTE(review): a '>' that does not close a '<' also lands here
			// and is evaluated against html[tagStart:i] with a stale tagStart;
			// behaviour left unchanged - confirm whether it should be text.
			shouldOutput = true
			tag := html[tagStart:i]
			tagNameLowercase := strings.ToLower(tag)

			switch {
			case tagNameLowercase == "/ul", tagNameLowercase == "/ol":
				outBuf.WriteString(opts.lbr)

			case tagNameLowercase == "li", tagNameLowercase == "li/":
				// listPrefix is empty unless list support was requested
				outBuf.WriteString(opts.lbr + opts.listPrefix)

			case headersRE.MatchString(tagNameLowercase):
				if canPrintNewline {
					outBuf.WriteString(opts.lbr + opts.lbr)
				}
				canPrintNewline = false

			case tagNameLowercase == "br", tagNameLowercase == "br/":
				// new line
				outBuf.WriteString(opts.lbr)

			case tagNameLowercase == "p", tagNameLowercase == "/p":
				if canPrintNewline {
					outBuf.WriteString(opts.lbr + opts.lbr)
				}
				canPrintNewline = false

			case opts.linksInnerText && tagNameLowercase == "/a":
				// end of link
				// hrefs can be empty if the link had no href or its href
				// matched badLinkHrefRE
				if len(hrefs) > 0 {
					outBuf.WriteString(" <")
					outBuf.WriteString(HTMLEntitiesToText(hrefs[0]))
					outBuf.WriteString(">")
					hrefs = hrefs[1:]
				}

			case opts.linksInnerText && linkTagRE.MatchString(tagNameLowercase):
				// link with href: remember the href until the closing tag
				if link, ok := hrefFromLinkTag(tag); ok && !badLinkHrefRE.MatchString(link) {
					hrefs = append(hrefs, link)
				}

			case badTagnamesRE.MatchString(tagNameLowercase):
				if opts.linksInnerText {
					// Of the names badTagnamesRE accepts only "a" starts with
					// 'a'. An <a> reaching this point has no parsable href;
					// its closing tag is consumed by the "/a" case above,
					// which never lowers the depth, so raising it here would
					// suppress the rest of the document. Keep its inner text.
					if tagNameLowercase[0] != 'a' {
						badTagStackDepth++
					}
				} else {
					// unwanted block
					badTagStackDepth++

					// link inner text is not preserved: output the href of a
					// link tag in place of its text
					if link, ok := hrefFromLinkTag(tag); ok && !badLinkHrefRE.MatchString(link) {
						outBuf.WriteString(HTMLEntitiesToText(link))
					}
				}

			case strings.HasPrefix(tagNameLowercase, "/") &&
				badTagnamesRE.MatchString(tagNameLowercase[1:]):
				// end of unwanted block; a closing tag without a matching
				// opening tag must not push the depth below zero, otherwise
				// all following text would be suppressed
				if badTagStackDepth > 0 {
					badTagStackDepth--
				}
			}
			continue

		} // switch end

		if shouldOutput && badTagStackDepth == 0 && !inEnt {
			canPrintNewline = true
			outBuf.WriteRune(r)
		}
	}

	return outBuf.String()
}

// hrefFromLinkTag returns the href value of tag, the text between '<' and '>'
// of an <a> tag. It reports false when linkTagRE does not match; an empty
// value with true means the attribute is present but empty.
func hrefFromLinkTag(tag string) (string, bool) {
	m := linkTagRE.FindStringSubmatch(tag)
	if len(m) != 5 {
		return "", false
	}
	// submatches 2-4: single-quoted, double-quoted and unquoted value
	for _, value := range m[2:] {
		if value != "" {
			return value, true
		}
	}
	return "", true
}
[bugfix] Use better plaintext representation of status for filtering (#3301) * [bugfix] Use better plaintext representation of status for filtering * add new deps to readme * lint * update tests * update regexes * address review comments * remove now unused xxhash * whoops, wrong logger * Merge branch 'main' into status_filtering_bugfix * put cache in caches struct * pain 2024-09-16 15:00:23 +03:00			`package html2text`

			`import (`
			`"bytes"`
			`"regexp"`
			`"strconv"`
			`"strings"`
			`)`

			`// Line break constants`
			`// Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreak())`
			`const (`
			`WIN_LBR = "\r\n"`
			`UNIX_LBR = "\n"`
			`)`

			`var legacyLBR = WIN_LBR`
			var badTagnamesRE = regexp.MustCompile(`^(head\|script\|style\|a)($\|\s+)`)
			var linkTagRE = regexp.MustCompile(`^(?i:a)(?:$\|\s).(?i:href)\s=\s('([^']?)'\|"([^"]*?)"\|([^\s"'` + "`" + `=<>]+))`)
			var badLinkHrefRE = regexp.MustCompile(`javascript:`)
			var headersRE = regexp.MustCompile(`^(\/)?h[1-6]`)
			var numericEntityRE = regexp.MustCompile(`(?i)^#(x?[a-f0-9]+)$`)

			`type options struct {`
			`lbr string`
			`linksInnerText bool`
			`listPrefix string`
			`}`

			`func newOptions() *options {`
			`// apply defaults`
			`return &options{`
			`lbr: WIN_LBR,`
			`}`
			`}`

			`// Option is a functional option`
			`type Option func(*options)`

			`// WithUnixLineBreaks instructs the converter to use unix line breaks ("\n" instead of "\r\n" default)`
			`func WithUnixLineBreaks() Option {`
			`return func(o *options) {`
			`o.lbr = UNIX_LBR`
			`}`
			`}`

			`// WithLinksInnerText instructs the converter to retain link tag inner text and append href URLs in angle brackets after the text`
			`// Example: click news <http://bit.ly/2n4wXRs>`
			`func WithLinksInnerText() Option {`
			`return func(o *options) {`
			`o.linksInnerText = true`
			`}`
			`}`

			`// WithListSupportPrefix formats <ul> and <li> lists with the specified prefix`
			`func WithListSupportPrefix(prefix string) Option {`
			`return func(o *options) {`
			`o.listPrefix = prefix`
			`}`
			`}`

			`// WithListSupport formats <ul> and <li> lists with " - " prefix`
			`func WithListSupport() Option {`
			`return WithListSupportPrefix(" - ")`
			`}`

			`func parseHTMLEntity(entName string) (string, bool) {`
			`if r, ok := entity[entName]; ok {`
			`return string(r), true`
			`}`

			`if match := numericEntityRE.FindStringSubmatch(entName); len(match) == 2 {`
			`var (`
			`err error`
			`n int64`
			`digits = match[1]`
			`)`

			`if digits != "" && (digits[0] == 'x' \|\| digits[0] == 'X') {`
			`n, err = strconv.ParseInt(digits[1:], 16, 64)`
			`} else {`
			`n, err = strconv.ParseInt(digits, 10, 64)`
			`}`

			`if err == nil && (n == 9 \|\| n == 10 \|\| n == 13 \|\| n > 31) {`
			`return string(rune(n)), true`
			`}`
			`}`

			`return "", false`
			`}`

			`// SetUnixLbr with argument true sets Unix-style line-breaks in output ("\n")`
			`// with argument false sets Windows-style line-breaks in output ("\r\n", the default)`
			`// Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreak())`
			`func SetUnixLbr(b bool) {`
			`if b {`
			`legacyLBR = UNIX_LBR`
			`} else {`
			`legacyLBR = WIN_LBR`
			`}`
			`}`

			`// HTMLEntitiesToText decodes HTML entities inside a provided`
			`// string and returns decoded text`
			`func HTMLEntitiesToText(htmlEntsText string) string {`
			`outBuf := bytes.NewBufferString("")`
			`inEnt := false`

			`for i, r := range htmlEntsText {`
			`switch {`
			`case r == ';' && inEnt:`
			`inEnt = false`
			`continue`

			`case r == '&': //possible html entity`
			`entName := ""`
			`isEnt := false`

			`// parse the entity name - max 10 chars`
			`chars := 0`
			`for _, er := range htmlEntsText[i+1:] {`
			`if er == ';' {`
			`isEnt = true`
			`break`
			`} else {`
			`entName += string(er)`
			`}`

			`chars++`
			`if chars == 10 {`
			`break`
			`}`
			`}`

			`if isEnt {`
			`if ent, isEnt := parseHTMLEntity(entName); isEnt {`
			`outBuf.WriteString(ent)`
			`inEnt = true`
			`continue`
			`}`
			`}`
			`}`

			`if !inEnt {`
			`outBuf.WriteRune(r)`
			`}`
			`}`

			`return outBuf.String()`
			`}`

			`func writeSpace(outBuf *bytes.Buffer) {`
			`bts := outBuf.Bytes()`
			`if len(bts) > 0 && bts[len(bts)-1] != ' ' {`
			`outBuf.WriteString(" ")`
			`}`
			`}`

			`// HTML2Text converts html into a text form`
			`func HTML2Text(html string) string {`
			`var opts []Option`
			`if legacyLBR == UNIX_LBR {`
			`opts = append(opts, WithUnixLineBreaks())`
			`}`
			`return HTML2TextWithOptions(html, opts...)`
			`}`

			`// HTML2TextWithOptions converts html into a text form with additional options`
			`func HTML2TextWithOptions(html string, reqOpts ...Option) string {`
			`opts := newOptions()`
			`for _, opt := range reqOpts {`
			`opt(opts)`
			`}`

			`inLen := len(html)`
			`tagStart := 0`
			`inEnt := false`
			`badTagStackDepth := 0 // if == 1 it means we are inside <head>...</head>`
			`shouldOutput := true`
			`// maintain a stack of <a> tag href links and output it after the tag's inner text (for opts.linksInnerText only)`
			`hrefs := []string{}`
			`// new line cannot be printed at the beginning or`
			`// for <p> after a new line created by previous <p></p>`
			`canPrintNewline := false`

			`outBuf := bytes.NewBufferString("")`

			`for i, r := range html {`
			`if inLen > 0 && i == inLen-1 {`
			`// prevent new line at the end of the document`
			`canPrintNewline = false`
			`}`

			`switch {`
			`// skip new lines and spaces adding a single space if not there yet`
			`case r <= 0xD, r == 0x85, r == 0x2028, r == 0x2029, // new lines`
			`r == ' ', r >= 0x2008 && r <= 0x200B: // spaces`
			`if shouldOutput && badTagStackDepth == 0 && !inEnt {`
			`//outBuf.WriteString(fmt.Sprintf("{DBG r:%c, inEnt:%t, tag:%s}", r, inEnt, html[tagStart:i]))`
			`writeSpace(outBuf)`
			`}`
			`continue`

			`case r == ';' && inEnt: // end of html entity`
			`inEnt = false`
			`continue`

			`case r == '&' && shouldOutput: // possible html entity`
			`entName := ""`
			`isEnt := false`

			`// parse the entity name - max 10 chars`
			`chars := 0`
			`for _, er := range html[i+1:] {`
			`if er == ';' {`
			`isEnt = true`
			`break`
			`} else {`
			`entName += string(er)`
			`}`

			`chars++`
			`if chars == 10 {`
			`break`
			`}`
			`}`

			`if isEnt {`
			`if ent, isEnt := parseHTMLEntity(entName); isEnt {`
			`outBuf.WriteString(ent)`
			`inEnt = true`
			`continue`
			`}`
			`}`

			`case r == '<': // start of a tag`
			`tagStart = i + 1`
			`shouldOutput = false`
			`continue`

			`case r == '>': // end of a tag`
			`shouldOutput = true`
			`tag := html[tagStart:i]`
			`tagNameLowercase := strings.ToLower(tag)`

			`if tagNameLowercase == "/ul" \|\| tagNameLowercase == "/ol" {`
			`outBuf.WriteString(opts.lbr)`
			`} else if tagNameLowercase == "li" \|\| tagNameLowercase == "li/" {`
			`if opts.listPrefix != "" {`
			`outBuf.WriteString(opts.lbr + opts.listPrefix)`
			`} else {`
			`outBuf.WriteString(opts.lbr)`
			`}`
			`} else if headersRE.MatchString(tagNameLowercase) {`
			`if canPrintNewline {`
			`outBuf.WriteString(opts.lbr + opts.lbr)`
			`}`
			`canPrintNewline = false`
			`} else if tagNameLowercase == "br" \|\| tagNameLowercase == "br/" {`
			`// new line`
			`outBuf.WriteString(opts.lbr)`
			`} else if tagNameLowercase == "p" \|\| tagNameLowercase == "/p" {`
			`if canPrintNewline {`
			`outBuf.WriteString(opts.lbr + opts.lbr)`
			`}`
			`canPrintNewline = false`
			`} else if opts.linksInnerText && tagNameLowercase == "/a" {`
			`// end of link`
			`// links can be empty can happen if the link matches the badLinkHrefRE`
			`if len(hrefs) > 0 {`
			`outBuf.WriteString(" <")`
			`outBuf.WriteString(HTMLEntitiesToText(hrefs[0]))`
			`outBuf.WriteString(">")`
			`hrefs = hrefs[1:]`
			`}`
			`} else if opts.linksInnerText && linkTagRE.MatchString(tagNameLowercase) {`
			`// parse link href`
			`// add special handling for a tags`
			`m := linkTagRE.FindStringSubmatch(tag)`
			`if len(m) == 5 {`
			`link := m[2]`
			`if len(link) == 0 {`
			`link = m[3]`
			`if len(link) == 0 {`
			`link = m[4]`
			`}`
			`}`

			`if opts.linksInnerText && !badLinkHrefRE.MatchString(link) {`
			`hrefs = append(hrefs, link)`
			`}`
			`}`
			`} else if badTagnamesRE.MatchString(tagNameLowercase) {`
			`// unwanted block`
			`badTagStackDepth++`

			`// if link inner text preservation is not enabled`
			`// and the current tag is a link tag, parse its href and output that`
			`if !opts.linksInnerText {`
			`// parse link href`
			`m := linkTagRE.FindStringSubmatch(tag)`
			`if len(m) == 5 {`
			`link := m[2]`
			`if len(link) == 0 {`
			`link = m[3]`
			`if len(link) == 0 {`
			`link = m[4]`
			`}`
			`}`

			`if !badLinkHrefRE.MatchString(link) {`
			`outBuf.WriteString(HTMLEntitiesToText(link))`
			`}`
			`}`
			`}`
			`} else if len(tagNameLowercase) > 0 && tagNameLowercase[0] == '/' &&`
			`badTagnamesRE.MatchString(tagNameLowercase[1:]) {`
			`// end of unwanted block`
			`badTagStackDepth--`
			`}`
			`continue`

			`} // switch end`

			`if shouldOutput && badTagStackDepth == 0 && !inEnt {`
			`canPrintNewline = true`
			`outBuf.WriteRune(r)`
			`}`
			`}`

			`return outBuf.String()`
			`}`