mirror of
https://github.com/go-gitea/gitea.git
synced 2026-04-11 19:03:10 +09:00
Follow-up to #37078. - Use Unicode Control Pictures (U+2400-U+2421) to render C0 control characters - Make it work in diff view too - Replace escape warning emoji with SVG - Align escape warning button with code lines --------- Co-authored-by: wxiaoguang <wxiaoguang@gmail.com>
419 lines
12 KiB
Go
419 lines
12 KiB
Go
// Copyright 2022 The Gitea Authors. All rights reserved.
|
||
// SPDX-License-Identifier: MIT
|
||
|
||
package charset
|
||
|
||
import (
|
||
"bytes"
|
||
"fmt"
|
||
"html"
|
||
"io"
|
||
"unicode"
|
||
"unicode/utf8"
|
||
|
||
"code.gitea.io/gitea/modules/setting"
|
||
"code.gitea.io/gitea/modules/translation"
|
||
)
|
||
|
||
// htmlChunkReader reads HTML from an io.Reader in chunks and splits the data
// into alternating "tag" parts and "content" parts (see readRunes).
type htmlChunkReader struct {
	// in is the source the HTML is read from.
	in io.Reader
	// readErr is the error returned by the most recent Read on in;
	// once it is non-nil no further reads are attempted.
	readErr error
	// readBuf holds the bytes that were read but not yet handed out as parts.
	readBuf []byte
	// curInTag tells whether the next unconsumed byte belongs to a tag
	// (between '<' and '>') rather than to content.
	curInTag bool
}
|
||
|
||
type escapeStreamer struct {
|
||
htmlChunkReader
|
||
|
||
escaped *EscapeStatus
|
||
locale translation.Locale
|
||
ambiguousTables []*AmbiguousTable
|
||
allowed map[rune]bool
|
||
|
||
out io.Writer
|
||
}
|
||
|
||
func escapeStream(locale translation.Locale, in io.Reader, out io.Writer, opts ...EscapeOptions) (*EscapeStatus, error) {
|
||
es := &escapeStreamer{
|
||
escaped: &EscapeStatus{},
|
||
locale: locale,
|
||
ambiguousTables: AmbiguousTablesForLocale(locale),
|
||
htmlChunkReader: htmlChunkReader{
|
||
in: in,
|
||
readBuf: make([]byte, 0, 32*1024),
|
||
},
|
||
out: out,
|
||
}
|
||
|
||
if len(opts) > 0 {
|
||
es.allowed = opts[0].Allowed
|
||
}
|
||
|
||
readCount := 0
|
||
lastIsTag := false
|
||
for {
|
||
parts, partInTag, err := es.readRunes()
|
||
readCount++
|
||
if err == io.EOF {
|
||
return es.escaped, nil
|
||
} else if err != nil {
|
||
return nil, err
|
||
}
|
||
for i, part := range parts {
|
||
if partInTag[i] {
|
||
lastIsTag = true
|
||
if _, err := out.Write(part); err != nil {
|
||
return nil, err
|
||
}
|
||
} else {
|
||
// if last part is tag, then this part is content begin
|
||
// if the content is the first part of the first read, then it's also content begin
|
||
isContentBegin := lastIsTag || (readCount == 1 && i == 0)
|
||
lastIsTag = false
|
||
if isContentBegin {
|
||
if part, err = es.trimAndWriteBom(part); err != nil {
|
||
return nil, err
|
||
}
|
||
}
|
||
if err = es.detectAndWriteRunes(part); err != nil {
|
||
return nil, err
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
func (e *escapeStreamer) trimAndWriteBom(part []byte) ([]byte, error) {
|
||
remaining, ok := bytes.CutPrefix(part, globalVars().utf8Bom)
|
||
if ok {
|
||
part = remaining
|
||
if _, err := e.out.Write(globalVars().utf8Bom); err != nil {
|
||
return part, err
|
||
}
|
||
}
|
||
return part, nil
|
||
}
|
||
|
||
// longSentenceDetectionLimit is how many runes on each side of a suspicious rune are inspected
// when deciding whether it is part of a long non-ASCII sentence; inputs with fewer runes than
// this skip that inspection entirely.
const longSentenceDetectionLimit = 20
|
||
|
||
func (e *escapeStreamer) possibleLongSentence(results []detectResult, pos int) bool {
|
||
countBasic := 0
|
||
countNonASCII := 0
|
||
for i := max(pos-longSentenceDetectionLimit, 0); i < min(pos+longSentenceDetectionLimit, len(results)); i++ {
|
||
if results[i].runeType == runeTypeBasic && results[i].runeChar != ' ' {
|
||
countBasic++
|
||
}
|
||
if results[i].runeType == runeTypeNonASCII || results[i].runeType == runeTypeAmbiguous {
|
||
countNonASCII++
|
||
}
|
||
}
|
||
countChar := countBasic + countNonASCII
|
||
// many non-ASCII runes around, it seems to be a sentence,
|
||
// don't handle the invisible/ambiguous chars in it, otherwise it will be too noisy
|
||
return countChar != 0 && countNonASCII*100/countChar >= 50
|
||
}
|
||
|
||
func (e *escapeStreamer) analyzeDetectResults(results []detectResult) {
|
||
for i := range results {
|
||
res := &results[i]
|
||
if res.runeType == runeTypeInvisible || res.runeType == runeTypeAmbiguous {
|
||
leftIsNonASCII := i > 0 && (results[i-1].runeType == runeTypeNonASCII || results[i-1].runeType == runeTypeAmbiguous)
|
||
rightIsNonASCII := i < len(results)-1 && (results[i+1].runeType == runeTypeNonASCII || results[i+1].runeType == runeTypeAmbiguous)
|
||
surroundingNonASCII := leftIsNonASCII || rightIsNonASCII
|
||
if !surroundingNonASCII {
|
||
if len(results) < longSentenceDetectionLimit {
|
||
res.needEscape = setting.UI.AmbiguousUnicodeDetection
|
||
} else if !e.possibleLongSentence(results, i) {
|
||
res.needEscape = setting.UI.AmbiguousUnicodeDetection
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
func (e *escapeStreamer) detectAndWriteRunes(part []byte) error {
|
||
results := e.detectRunes(part)
|
||
e.analyzeDetectResults(results)
|
||
return e.writeDetectResults(part, results)
|
||
}
|
||
|
||
func (e *htmlChunkReader) readRunes() (parts [][]byte, partInTag []bool, _ error) {
|
||
// we have read everything, eof
|
||
if e.readErr != nil && len(e.readBuf) == 0 {
|
||
return nil, nil, e.readErr
|
||
}
|
||
|
||
// not eof, and the there is space in the buffer, try to read more data
|
||
if e.readErr == nil && len(e.readBuf) <= cap(e.readBuf)*3/4 {
|
||
n, err := e.in.Read(e.readBuf[len(e.readBuf):cap(e.readBuf)])
|
||
e.readErr = err
|
||
e.readBuf = e.readBuf[:len(e.readBuf)+n]
|
||
}
|
||
if len(e.readBuf) == 0 {
|
||
return nil, nil, e.readErr
|
||
}
|
||
|
||
// try to exact tag parts and content parts
|
||
pos := 0
|
||
for pos < len(e.readBuf) {
|
||
var curPartEnd int
|
||
nextInTag := e.curInTag
|
||
if e.curInTag {
|
||
// if cur part is in tag, try to find the tag close char '>'
|
||
idx := bytes.IndexByte(e.readBuf[pos:], '>')
|
||
if idx == -1 {
|
||
// if no tag close char, then the whole buffer is in tag
|
||
curPartEnd = len(e.readBuf)
|
||
} else {
|
||
// tag part ends, switch to content part
|
||
curPartEnd = pos + idx + 1
|
||
nextInTag = !nextInTag
|
||
}
|
||
} else {
|
||
// if cur part is in content, try to find the tag open char '<'
|
||
idx := bytes.IndexByte(e.readBuf[pos:], '<')
|
||
if idx == -1 {
|
||
// if no tag open char, then the whole buffer is in content
|
||
curPartEnd = len(e.readBuf)
|
||
} else {
|
||
// content part ends, switch to tag part
|
||
curPartEnd = pos + idx
|
||
nextInTag = !nextInTag
|
||
}
|
||
}
|
||
|
||
curPartLen := curPartEnd - pos
|
||
if curPartLen == 0 {
|
||
// if cur part is empty, only need to switch the part type
|
||
if e.curInTag == nextInTag {
|
||
panic("impossible, curPartLen is 0 but the part in tag status is not switched")
|
||
}
|
||
e.curInTag = nextInTag
|
||
continue
|
||
}
|
||
|
||
// now, curPartLen can't be 0
|
||
curPart := make([]byte, curPartLen)
|
||
copy(curPart, e.readBuf[pos:curPartEnd])
|
||
// now we get the curPart bytes, but we can't directly use it, the last rune in it might have been cut
|
||
// try to decode the last rune, if it's invalid, then we cut the last byte and try again until we get a valid rune or no byte left
|
||
for i := curPartLen - 1; i >= 0; i-- {
|
||
last, lastSize := utf8.DecodeRune(curPart[i:])
|
||
if last == utf8.RuneError && lastSize == 1 {
|
||
curPartLen--
|
||
} else {
|
||
curPartLen += lastSize - 1
|
||
break
|
||
}
|
||
}
|
||
if curPartLen == 0 {
|
||
// actually it's impossible that the part doesn't contain any valid rune,
|
||
// the only case is that the cap(readBuf) is too small, or the origin contain indeed doesn't contain any valid rune
|
||
// * try to leave the last 4 bytes (possible longest utf-8 encoding) to next round
|
||
// * at least consume 1 byte to avoid infinite loop
|
||
curPartLen = max(len(curPart)-utf8.UTFMax, 1)
|
||
}
|
||
|
||
// if curPartLen is not the same as curPart, it means we have cut some bytes,
|
||
// need to wait for more data if not eof
|
||
trailingCorrupted := curPartLen != len(curPart)
|
||
|
||
// finally, we get the real part we need
|
||
curPart = curPart[:curPartLen]
|
||
parts = append(parts, curPart)
|
||
partInTag = append(partInTag, e.curInTag)
|
||
|
||
pos += curPartLen
|
||
e.curInTag = nextInTag
|
||
|
||
if trailingCorrupted && e.readErr == nil {
|
||
// if the last part is corrupted, and we haven't reach eof, then we need to wait for more data to get the complete part
|
||
break
|
||
}
|
||
}
|
||
|
||
copy(e.readBuf, e.readBuf[pos:])
|
||
e.readBuf = e.readBuf[:len(e.readBuf)-pos]
|
||
return parts, partInTag, nil
|
||
}
|
||
|
||
func (e *escapeStreamer) writeDetectResults(data []byte, results []detectResult) error {
|
||
lastWriteRawIdx := -1
|
||
for idx := range results {
|
||
res := &results[idx]
|
||
if !res.needEscape {
|
||
if lastWriteRawIdx == -1 {
|
||
lastWriteRawIdx = idx
|
||
}
|
||
continue
|
||
}
|
||
|
||
if lastWriteRawIdx != -1 {
|
||
if _, err := e.out.Write(data[results[lastWriteRawIdx].position:res.position]); err != nil {
|
||
return err
|
||
}
|
||
lastWriteRawIdx = -1
|
||
}
|
||
switch res.runeType {
|
||
case runeTypeBroken:
|
||
if err := e.writeBrokenRune(data[res.position : res.position+res.runeSize]); err != nil {
|
||
return err
|
||
}
|
||
case runeTypeAmbiguous:
|
||
if err := e.writeAmbiguousRune(res.runeChar, res.confusable); err != nil {
|
||
return err
|
||
}
|
||
case runeTypeInvisible:
|
||
if err := e.writeInvisibleRune(res.runeChar); err != nil {
|
||
return err
|
||
}
|
||
case runeTypeControlChar:
|
||
if err := e.writeControlRune(res.runeChar); err != nil {
|
||
return err
|
||
}
|
||
default:
|
||
panic("unreachable")
|
||
}
|
||
}
|
||
if lastWriteRawIdx != -1 {
|
||
lastResult := results[len(results)-1]
|
||
if _, err := e.out.Write(data[results[lastWriteRawIdx].position : lastResult.position+lastResult.runeSize]); err != nil {
|
||
return err
|
||
}
|
||
}
|
||
return nil
|
||
}
|
||
|
||
func (e *escapeStreamer) writeBrokenRune(_ []byte) (err error) {
|
||
// Although we'd like to use the original bytes to display (show the real broken content to users),
|
||
// however, when this "escape stream" module is applied to the content, the content has already been processed by other modules.
|
||
// So the invalid bytes just can't be kept till this step, in most (all) cases, the only thing we see here is utf8.RuneError
|
||
_, err = io.WriteString(e.out, `<span class="broken-code-point"><3E></span>`)
|
||
return err
|
||
}
|
||
|
||
func (e *escapeStreamer) writeEscapedCharHTML(tag1, attr, tag2, content, tag3 string) (err error) {
|
||
_, err = io.WriteString(e.out, tag1)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
_, err = io.WriteString(e.out, html.EscapeString(attr))
|
||
if err != nil {
|
||
return err
|
||
}
|
||
_, err = io.WriteString(e.out, tag2)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
_, err = io.WriteString(e.out, html.EscapeString(content))
|
||
if err != nil {
|
||
return err
|
||
}
|
||
_, err = io.WriteString(e.out, tag3)
|
||
return err
|
||
}
|
||
|
||
// runeToHex formats a rune as its code point in the form "[U+XXXX]":
// upper-case hexadecimal, zero-padded to at least four digits.
func runeToHex(r rune) string {
	const codePointFormat = "[U+%04X]"
	return fmt.Sprintf(codePointFormat, r)
}
|
||
|
||
func (e *escapeStreamer) writeAmbiguousRune(r, c rune) (err error) {
|
||
e.escaped.Escaped = true
|
||
e.escaped.HasAmbiguous = true
|
||
return e.writeEscapedCharHTML(
|
||
`<span class="ambiguous-code-point" data-tooltip-content="`,
|
||
e.locale.TrString("repo.ambiguous_character", string(r)+" "+runeToHex(r), string(c)+" "+runeToHex(c)),
|
||
`"><span class="char">`,
|
||
string(r),
|
||
`</span></span>`,
|
||
)
|
||
}
|
||
|
||
func (e *escapeStreamer) writeInvisibleRune(r rune) error {
|
||
e.escaped.Escaped = true
|
||
e.escaped.HasInvisible = true
|
||
return e.writeEscapedCharHTML(
|
||
`<span class="escaped-code-point" data-escaped="`,
|
||
runeToHex(r),
|
||
`"><span class="char">`,
|
||
string(r),
|
||
`</span></span>`,
|
||
)
|
||
}
|
||
|
||
func (e *escapeStreamer) writeControlRune(r rune) error {
|
||
var display string
|
||
if r >= 0 && r <= 0x1f {
|
||
display = string(0x2400 + r)
|
||
} else if r == 0x7f {
|
||
display = string(rune(0x2421))
|
||
} else {
|
||
display = runeToHex(r)
|
||
}
|
||
return e.writeEscapedCharHTML(
|
||
`<span class="broken-code-point" data-escaped="`,
|
||
display,
|
||
`"><span class="char">`,
|
||
string(r),
|
||
`</span></span>`,
|
||
)
|
||
}
|
||
|
||
// detectResult describes one rune of a content part.
type detectResult struct {
	// runeChar is the decoded rune (utf8.RuneError for broken input).
	runeChar rune
	// runeType is one of the runeType* constants.
	runeType int
	// runeSize is the number of bytes the rune occupies in the data.
	runeSize int
	// position is the byte offset of the rune in the data.
	position int
	// confusable is the rune that runeChar can be confused with; only set for ambiguous runes.
	confusable rune
	// needEscape tells whether the rune must be written in escaped form.
	needEscape bool
}
|
||
|
||
// Rune classifications stored in detectResult.runeType.
// runeTypeBasic is the zero value, so an unclassified rune counts as basic.
const (
	// runeTypeBasic: nothing special (ASCII, or whitespace / explicitly allowed ASCII).
	runeTypeBasic int = iota
	// runeTypeBroken: the rune decoded to utf8.RuneError.
	runeTypeBroken
	// runeTypeNonASCII: a rune >= U+0080 that is neither invisible nor ambiguous, or an allowed one.
	runeTypeNonASCII
	// runeTypeAmbiguous: a rune that can be confused with another one.
	runeTypeAmbiguous
	// runeTypeInvisible: a rune from the invisible range table.
	runeTypeInvisible
	// runeTypeControlChar: a rune below U+0020 (except the allowed whitespace) or DEL (U+007F).
	runeTypeControlChar
)
|
||
|
||
func (e *escapeStreamer) detectRunes(data []byte) []detectResult {
|
||
runeCount := utf8.RuneCount(data)
|
||
results := make([]detectResult, runeCount)
|
||
invisibleRangeTable := globalVars().invisibleRangeTable
|
||
var i int
|
||
var confusable rune
|
||
for pos := 0; pos < len(data); i++ {
|
||
r, runeSize := utf8.DecodeRune(data[pos:])
|
||
results[i].runeChar = r
|
||
results[i].runeSize = runeSize
|
||
results[i].position = pos
|
||
pos += runeSize
|
||
|
||
switch {
|
||
case r == utf8.RuneError:
|
||
results[i].runeType = runeTypeBroken
|
||
results[i].needEscape = true
|
||
case r == ' ' || r == '\t' || r == '\n' || e.allowed[r]:
|
||
results[i].runeType = runeTypeBasic
|
||
if r >= 0x80 {
|
||
results[i].runeType = runeTypeNonASCII
|
||
}
|
||
case r < 0x20 || r == 0x7f:
|
||
results[i].runeType = runeTypeControlChar
|
||
results[i].needEscape = true
|
||
case unicode.Is(invisibleRangeTable, r):
|
||
results[i].runeType = runeTypeInvisible
|
||
// not sure about results[i].needEscape, will be detected separately
|
||
case isAmbiguous(r, &confusable, e.ambiguousTables...):
|
||
results[i].runeType = runeTypeAmbiguous
|
||
results[i].confusable = confusable
|
||
// not sure about results[i].needEscape, will be detected separately
|
||
case r >= 0x80:
|
||
results[i].runeType = runeTypeNonASCII
|
||
default: // details to basic runes
|
||
}
|
||
}
|
||
return results
|
||
}
|