gitea/modules/charset/escape_stream.go

// Copyright 2022 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT

package charset

import (
	"bytes"
	"fmt"
	"html"
	"io"
	"unicode"
	"unicode/utf8"

	"code.gitea.io/gitea/modules/setting"
	"code.gitea.io/gitea/modules/translation"
)

// htmlChunkReader buffers HTML read from an io.Reader so that readRunes can split it
// into tag parts and content parts.
type htmlChunkReader struct {
	// in is the source of the HTML bytes.
	in       io.Reader
	// readErr is the error returned by the most recent Read on in. Once it is non-nil, readRunes stops reading
	// and returns it after the buffered bytes have been handed out.
	readErr  error
	// readBuf holds the bytes that were read from in but not yet returned by readRunes.
	readBuf  []byte
	// curInTag reports whether readRunes is currently scanning inside a tag (looking for '>')
	// rather than inside content (looking for '<'). It is kept between calls.
	curInTag bool
}

// escapeStreamer copies HTML from the embedded htmlChunkReader to out: tag parts are written unchanged,
// content parts are scanned rune by rune and the runes that need escaping are wrapped into HTML markup.
type escapeStreamer struct {
	htmlChunkReader

	// escaped collects what has been escaped so far, it is the status returned by escapeStream.
	escaped         *EscapeStatus
	// locale is used to translate the tooltip of ambiguous runes.
	locale          translation.Locale
	// ambiguousTables are the tables passed to isAmbiguous, they are selected by AmbiguousTablesForLocale(locale).
	ambiguousTables []*AmbiguousTable
	// allowed lists the runes that detectRunes never classifies as control, invisible or ambiguous. It may be nil.
	allowed         map[rune]bool

	// out receives the processed HTML.
	out io.Writer
}

// escapeStream copies the HTML read from in to out. Tag parts are written as they are, content parts are
// checked for broken, control, invisible and ambiguous runes, which get wrapped into HTML markup.
// Only the first element of opts is used, its Allowed map lists runes that must not be reported.
// It returns the collected escape status when the input ends with io.EOF; any other read error and
// any write error is returned as it is, together with a nil status.
func escapeStream(locale translation.Locale, in io.Reader, out io.Writer, opts ...EscapeOptions) (*EscapeStatus, error) {
	streamer := &escapeStreamer{
		htmlChunkReader: htmlChunkReader{
			in:      in,
			readBuf: make([]byte, 0, 32*1024),
		},
		escaped:         &EscapeStatus{},
		locale:          locale,
		ambiguousTables: AmbiguousTablesForLocale(locale),
		out:             out,
	}
	if len(opts) != 0 {
		streamer.allowed = opts[0].Allowed
	}

	prevWasTag := false
	for round := 1; ; round++ {
		chunks, inTag, err := streamer.readRunes()
		switch {
		case err == io.EOF:
			return streamer.escaped, nil
		case err != nil:
			return nil, err
		}

		for idx, chunk := range chunks {
			if inTag[idx] {
				// tags are passed through untouched
				prevWasTag = true
				if _, err := out.Write(chunk); err != nil {
					return nil, err
				}
				continue
			}

			// content begins either right after a tag, or with the very first part of the very first read
			atContentBegin := prevWasTag || (round == 1 && idx == 0)
			prevWasTag = false
			if atContentBegin {
				chunk, err = streamer.trimAndWriteBom(chunk)
				if err != nil {
					return nil, err
				}
			}
			if err := streamer.detectAndWriteRunes(chunk); err != nil {
				return nil, err
			}
		}
	}
}

// trimAndWriteBom checks whether part starts with the UTF-8 byte order mark. If it does, the BOM is written
// to the output unchanged and the bytes following it are returned, otherwise part is returned as it is.
func (e *escapeStreamer) trimAndWriteBom(part []byte) ([]byte, error) {
	bom := globalVars().utf8Bom
	rest, hasBom := bytes.CutPrefix(part, bom)
	if !hasBom {
		return part, nil
	}
	_, err := e.out.Write(bom)
	return rest, err
}

// longSentenceDetectionLimit is the radius (in runes) of the window that possibleLongSentence inspects around a rune:
// it starts this many runes before the position and ends just before this many runes after it.
// analyzeDetectResults also skips the long sentence check for inputs with fewer runes than this.
const longSentenceDetectionLimit = 20

// possibleLongSentence reports whether the rune at pos seems to be a part of a sentence written in a non-ASCII script.
// It looks at the runes from longSentenceDetectionLimit before pos up to (not including) longSentenceDetectionLimit
// after pos, counts the basic runes (spaces are ignored) and the non-ASCII runes (ambiguous ones included),
// and returns true if at least half of the counted runes are non-ASCII.
func (e *escapeStreamer) possibleLongSentence(results []detectResult, pos int) bool {
	windowBegin := max(pos-longSentenceDetectionLimit, 0)
	windowEnd := min(pos+longSentenceDetectionLimit, len(results))

	var basic, nonASCII int
	for idx := windowBegin; idx < windowEnd; idx++ {
		switch results[idx].runeType {
		case runeTypeBasic:
			if results[idx].runeChar != ' ' {
				basic++
			}
		case runeTypeNonASCII, runeTypeAmbiguous:
			nonASCII++
		}
	}

	total := basic + nonASCII
	if total == 0 {
		return false
	}
	// many non-ASCII runes around: it looks like a sentence, escaping the invisible/ambiguous chars in it
	// would be too noisy
	return nonASCII*100/total >= 50
}

// analyzeDetectResults decides for every invisible or ambiguous rune whether it needs to be escaped.
// Such a rune is left alone when one of its direct neighbors is a non-ASCII (or ambiguous) rune, or when the input
// has at least longSentenceDetectionLimit runes and the rune seems to be a part of a non-ASCII sentence.
// In all other cases needEscape is set to the value of setting.UI.AmbiguousUnicodeDetection.
func (e *escapeStreamer) analyzeDetectResults(results []detectResult) {
	looksNonASCII := func(idx int) bool {
		t := results[idx].runeType
		return t == runeTypeNonASCII || t == runeTypeAmbiguous
	}
	lastIdx := len(results) - 1
	isShortInput := len(results) < longSentenceDetectionLimit

	for idx := range results {
		cur := &results[idx]
		if cur.runeType != runeTypeInvisible && cur.runeType != runeTypeAmbiguous {
			continue
		}
		if idx > 0 && looksNonASCII(idx-1) {
			continue
		}
		if idx < lastIdx && looksNonASCII(idx+1) {
			continue
		}
		if isShortInput || !e.possibleLongSentence(results, idx) {
			cur.needEscape = setting.UI.AmbiguousUnicodeDetection
		}
	}
}

// detectAndWriteRunes processes one content part: it classifies the runes of the part, decides which of them
// need to be escaped, and writes the result to the output.
func (e *escapeStreamer) detectAndWriteRunes(part []byte) error {
	detected := e.detectRunes(part)
	e.analyzeDetectResults(detected)
	return e.writeDetectResults(part, detected)
}

// readRunes reads more data from the underlying reader (when there is enough room in the buffer) and splits the
// buffered bytes into consecutive parts: tag parts (up to and including '>') and content parts (up to the next '<').
// partInTag[i] reports whether parts[i] is a tag part. The scanning state is kept in e.curInTag, so a tag or
// a content run may be spread over several parts and several calls.
//
// Every returned part is a copy of the buffered bytes. Trailing bytes of a part that can't be decoded as UTF-8
// (for example a rune cut by the read boundary) are kept in the buffer for a later round instead of being returned.
//
// The error of the underlying reader (for example io.EOF) is returned only after all the buffered bytes have been
// handed out. A call may return no parts together with a nil error, the caller should then simply call again.
func (e *htmlChunkReader) readRunes() (parts [][]byte, partInTag []bool, _ error) {
	// everything has been read and handed out, report the stored error (e.g. io.EOF)
	if e.readErr != nil && len(e.readBuf) == 0 {
		return nil, nil, e.readErr
	}

	// not at the end yet, and there is room in the buffer: try to read more data
	if e.readErr == nil && len(e.readBuf) <= cap(e.readBuf)*3/4 {
		n, err := e.in.Read(e.readBuf[len(e.readBuf):cap(e.readBuf)])
		e.readErr = err
		e.readBuf = e.readBuf[:len(e.readBuf)+n]
	}
	if len(e.readBuf) == 0 {
		return nil, nil, e.readErr
	}

	// try to extract tag parts and content parts
	pos := 0
	for pos < len(e.readBuf) {
		var curPartEnd int
		nextInTag := e.curInTag
		if e.curInTag {
			// the current part is in a tag, try to find the tag close char '>'
			idx := bytes.IndexByte(e.readBuf[pos:], '>')
			if idx == -1 {
				// no tag close char, the rest of the buffer is in the tag
				curPartEnd = len(e.readBuf)
			} else {
				// the tag part ends (including the '>'), switch to content
				curPartEnd = pos + idx + 1
				nextInTag = !nextInTag
			}
		} else {
			// the current part is in content, try to find the tag open char '<'
			idx := bytes.IndexByte(e.readBuf[pos:], '<')
			if idx == -1 {
				// no tag open char, the rest of the buffer is content
				curPartEnd = len(e.readBuf)
			} else {
				// the content part ends (excluding the '<'), switch to tag
				curPartEnd = pos + idx
				nextInTag = !nextInTag
			}
		}

		curPartLen := curPartEnd - pos
		if curPartLen == 0 {
			// the current part is empty (a '<' directly at pos), only need to switch the part type
			if e.curInTag == nextInTag {
				panic("impossible, curPartLen is 0 but the part in tag status is not switched")
			}
			e.curInTag = nextInTag
			continue
		}

		// now, curPartLen can't be 0
		curPart := make([]byte, curPartLen)
		copy(curPart, e.readBuf[pos:curPartEnd])
		// we can't directly use the bytes: the last rune might have been cut by the read boundary.
		// Walk backwards from the end: every position that doesn't start a decodable rune shortens the part by
		// one byte, the first decodable rune gives its bytes back and ends the search.
		for i := curPartLen - 1; i >= 0; i-- {
			last, lastSize := utf8.DecodeRune(curPart[i:])
			if last == utf8.RuneError && lastSize == 1 {
				curPartLen--
			} else {
				curPartLen += lastSize - 1
				break
			}
		}
		if curPartLen == 0 {
			// the part doesn't contain any decodable rune
			reachedBufEnd := curPartEnd == len(e.readBuf)
			nextCallWillRead := len(curPart) <= cap(e.readBuf)*3/4
			if e.readErr == nil && reachedBufEnd && nextCallWillRead && !utf8.FullRune(curPart) {
				// the part is only the beginning of a multi-byte rune which was cut by the read boundary:
				// don't consume anything, the next call reads the missing bytes (or reaches the end of input)
				break
			}
			// the bytes are really invalid (or the end of input is reached):
			// * try to leave the last 4 bytes (the longest possible UTF-8 encoding) to the next round
			// * consume at least 1 byte to always make progress
			curPartLen = max(len(curPart)-utf8.UTFMax, 1)
		}

		// if curPartLen differs from len(curPart), some trailing bytes have been cut off
		// and are still waiting in the buffer
		trailingCorrupted := curPartLen != len(curPart)

		// finally, we get the real part we need
		curPart = curPart[:curPartLen]
		parts = append(parts, curPart)
		partInTag = append(partInTag, e.curInTag)
		pos += curPartLen

		if !trailingCorrupted {
			e.curInTag = nextInTag
			continue
		}
		// The cut-off bytes still belong to the current kind of part, so the in-tag state must not be switched yet.
		// Otherwise invalid bytes in front of a '<' would be passed through as a part of the tag.
		if e.readErr == nil {
			// more data may complete the trailing rune, wait for the next round
			break
		}
	}

	copy(e.readBuf, e.readBuf[pos:])
	e.readBuf = e.readBuf[:len(e.readBuf)-pos]
	return parts, partInTag, nil
}

// writeDetectResults writes data to the output: consecutive runes that don't need to be escaped are written
// with a single Write call as they are, every rune that needs to be escaped is replaced by the markup
// for its rune type. It panics if a rune is marked to be escaped but its type has no escaped form.
func (e *escapeStreamer) writeDetectResults(data []byte, results []detectResult) error {
	// rawStart is the byte offset where the pending run of unescaped runes begins, or noPending
	const noPending = -1
	rawStart := noPending

	for idx := range results {
		cur := &results[idx]
		if !cur.needEscape {
			if rawStart == noPending {
				rawStart = cur.position
			}
			continue
		}

		// flush the raw bytes in front of the rune to be escaped
		if rawStart != noPending {
			if _, err := e.out.Write(data[rawStart:cur.position]); err != nil {
				return err
			}
			rawStart = noPending
		}

		var err error
		switch cur.runeType {
		case runeTypeBroken:
			err = e.writeBrokenRune(data[cur.position : cur.position+cur.runeSize])
		case runeTypeAmbiguous:
			err = e.writeAmbiguousRune(cur.runeChar, cur.confusable)
		case runeTypeInvisible:
			err = e.writeInvisibleRune(cur.runeChar)
		case runeTypeControlChar:
			err = e.writeControlRune(cur.runeChar)
		default:
			panic("unreachable")
		}
		if err != nil {
			return err
		}
	}

	if rawStart == noPending {
		return nil
	}
	// flush the raw bytes up to the end of the last rune
	tail := &results[len(results)-1]
	_, err := e.out.Write(data[rawStart : tail.position+tail.runeSize])
	return err
}

// writeBrokenRune writes the HTML placeholder for a rune that couldn't be decoded.
// The broken bytes passed by the caller are intentionally not used, see the comment below.
// It returns the error of the underlying writer.
func (e *escapeStreamer) writeBrokenRune(_ []byte) (err error) {
	// Although we'd like to use the original bytes to display (show the real broken content to users),
	// however, when this "escape stream" module is applied to the content, the content has already been processed by other modules.
	// So the invalid bytes just can't be kept till this step, in most (all) cases, the only thing we see here is utf8.RuneError
	//
	// The angle brackets of the placeholder text are written as character references: a raw '<' in text content
	// is malformed HTML, and anything scanning the output for tags would take it for the start of a tag.
	_, err = io.WriteString(e.out, `<span class="broken-code-point">&lt;3E&gt;</span>`)
	return err
}

// writeEscapedCharHTML writes tag1, attr, tag2, content and tag3 to the output, in this order.
// tag1, tag2 and tag3 are written as they are, attr and content are HTML-escaped first.
// It stops at the first failed write and returns its error.
func (e *escapeStreamer) writeEscapedCharHTML(tag1, attr, tag2, content, tag3 string) (err error) {
	pieces := [...]string{
		tag1,
		html.EscapeString(attr),
		tag2,
		html.EscapeString(content),
		tag3,
	}
	for _, piece := range pieces {
		if _, err = io.WriteString(e.out, piece); err != nil {
			return err
		}
	}
	return nil
}

// runeToHex formats the code point of r as "[U+XXXX]": upper-case hexadecimal digits, padded with zeros to at least 4 digits.
func runeToHex(r rune) string {
	const codePointFormat = "[U+%04X]"
	return fmt.Sprintf(codePointFormat, r)
}

// writeAmbiguousRune records in the escape status that an ambiguous rune has been escaped and writes the markup for r:
// a span of class "ambiguous-code-point" whose tooltip is the translated "repo.ambiguous_character" message,
// built from r and from the rune c it can be confused with (both together with their code points).
func (e *escapeStreamer) writeAmbiguousRune(r, c rune) (err error) {
	e.escaped.Escaped = true
	e.escaped.HasAmbiguous = true

	shown := string(r)
	ambiguousDesc := shown + " " + runeToHex(r)
	confusableDesc := string(c) + " " + runeToHex(c)
	tooltip := e.locale.TrString("repo.ambiguous_character", ambiguousDesc, confusableDesc)
	return e.writeEscapedCharHTML(
		`<span class="ambiguous-code-point" data-tooltip-content="`,
		tooltip,
		`"><span class="char">`,
		shown,
		`</span></span>`,
	)
}

// writeInvisibleRune records in the escape status that an invisible rune has been escaped and writes the markup for r:
// a span of class "escaped-code-point" which carries the code point of r in its data-escaped attribute.
func (e *escapeStreamer) writeInvisibleRune(r rune) error {
	e.escaped.Escaped = true
	e.escaped.HasInvisible = true

	codePoint := runeToHex(r)
	return e.writeEscapedCharHTML(
		`<span class="escaped-code-point" data-escaped="`,
		codePoint,
		`"><span class="char">`,
		string(r),
		`</span></span>`,
	)
}

// writeControlRune writes the markup for the control char r: a span of class "broken-code-point" whose data-escaped
// attribute holds a visible replacement. The replacement is the rune from the "Control Pictures" block
// (U+2400 + r for 0x00..0x1f, U+2421 for 0x7f), any other rune is shown by its code point.
func (e *escapeStreamer) writeControlRune(r rune) error {
	var picture string
	switch {
	case r >= 0 && r <= 0x1f:
		picture = string(0x2400 + r)
	case r == 0x7f:
		picture = string(rune(0x2421))
	default:
		picture = runeToHex(r)
	}
	return e.writeEscapedCharHTML(
		`<span class="broken-code-point" data-escaped="`,
		picture,
		`"><span class="char">`,
		string(r),
		`</span></span>`,
	)
}

// detectResult describes one rune of a content part, it is filled by detectRunes and analyzeDetectResults.
type detectResult struct {
	// runeChar is the decoded rune, utf8.RuneError if the bytes couldn't be decoded.
	runeChar   rune
	// runeType is one of the runeType* constants.
	runeType   int
	// runeSize is the number of bytes the rune occupies in the part.
	runeSize   int
	// position is the byte offset of the rune in the part.
	position   int
	// confusable is the rune reported by isAmbiguous, it is only set for runeTypeAmbiguous
	// (presumably the character that the rune looks like).
	confusable rune
	// needEscape tells writeDetectResults to write the escaped form instead of the raw bytes.
	needEscape bool
}

// The rune types assigned by detectRunes.
const (
	// runeTypeBasic is the default (zero value): a rune below 0x80 that matches no other type,
	// including space, tab, newline and the allowed runes below 0x80.
	runeTypeBasic int = iota
	// runeTypeBroken is a rune decoded as utf8.RuneError, it is always escaped.
	runeTypeBroken
	// runeTypeNonASCII is a rune at or above 0x80 that is neither invisible nor ambiguous, or is explicitly allowed.
	runeTypeNonASCII
	// runeTypeAmbiguous is a rune reported by isAmbiguous, analyzeDetectResults decides whether it is escaped.
	runeTypeAmbiguous
	// runeTypeInvisible is a rune of the invisible range table, analyzeDetectResults decides whether it is escaped.
	runeTypeInvisible
	// runeTypeControlChar is a rune below 0x20 (except space, tab, newline and allowed runes) or 0x7f, it is always escaped.
	runeTypeControlChar
)

// detectRunes decodes data rune by rune and returns one detectResult per rune (an undecodable byte counts as one rune).
// Each result gets its rune, size, byte offset and rune type. needEscape is only set for the types which are always
// escaped (broken runes and control chars), the invisible and ambiguous runes are decided later by analyzeDetectResults.
func (e *escapeStreamer) detectRunes(data []byte) []detectResult {
	results := make([]detectResult, utf8.RuneCount(data))
	invisibleTable := globalVars().invisibleRangeTable
	var confusable rune

	idx := 0
	for offset := 0; offset < len(data); idx++ {
		r, size := utf8.DecodeRune(data[offset:])
		cur := &results[idx]
		cur.runeChar, cur.runeSize, cur.position = r, size, offset
		offset += size

		switch {
		case r == utf8.RuneError:
			cur.runeType = runeTypeBroken
			cur.needEscape = true
		case r == ' ' || r == '\t' || r == '\n' || e.allowed[r]:
			// common whitespace and explicitly allowed runes are never reported
			if r >= 0x80 {
				cur.runeType = runeTypeNonASCII
			} else {
				cur.runeType = runeTypeBasic
			}
		case r < 0x20 || r == 0x7f:
			cur.runeType = runeTypeControlChar
			cur.needEscape = true
		case unicode.Is(invisibleTable, r):
			// needEscape is not known yet, it is decided separately
			cur.runeType = runeTypeInvisible
		case isAmbiguous(r, &confusable, e.ambiguousTables...):
			// needEscape is not known yet, it is decided separately
			cur.runeType = runeTypeAmbiguous
			cur.confusable = confusable
		case r >= 0x80:
			cur.runeType = runeTypeNonASCII
		}
		// everything else keeps the zero value: runeTypeBasic
	}
	return results
}