初始提交: Gitea 项目代码
This commit is contained in:
@@ -0,0 +1,418 @@
|
||||
// Copyright 2022 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package charset
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"html"
|
||||
"io"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
|
||||
"gitea.dev/modules/setting"
|
||||
"gitea.dev/modules/translation"
|
||||
)
|
||||
|
||||
// htmlChunkReader buffers bytes read from in and hands them out as
// alternating tag / content parts (see readRunes).
type htmlChunkReader struct {
	// in is the underlying input stream.
	in io.Reader
	// readErr is the error returned by the most recent in.Read call;
	// once it is non-nil, readRunes stops reading from in.
	readErr error
	// readBuf holds bytes that were read but not yet returned as parts.
	readBuf []byte
	// curInTag reports whether the next unconsumed byte of readBuf is
	// treated as tag text (true) or content (false).
	curInTag bool
}
|
||||
|
||||
type escapeStreamer struct {
|
||||
htmlChunkReader
|
||||
|
||||
escaped *EscapeStatus
|
||||
locale translation.Locale
|
||||
ambiguousTables []*AmbiguousTable
|
||||
allowed map[rune]bool
|
||||
|
||||
out io.Writer
|
||||
}
|
||||
|
||||
func escapeStream(locale translation.Locale, in io.Reader, out io.Writer, opts ...EscapeOptions) (*EscapeStatus, error) {
|
||||
es := &escapeStreamer{
|
||||
escaped: &EscapeStatus{},
|
||||
locale: locale,
|
||||
ambiguousTables: AmbiguousTablesForLocale(locale),
|
||||
htmlChunkReader: htmlChunkReader{
|
||||
in: in,
|
||||
readBuf: make([]byte, 0, 32*1024),
|
||||
},
|
||||
out: out,
|
||||
}
|
||||
|
||||
if len(opts) > 0 {
|
||||
es.allowed = opts[0].Allowed
|
||||
}
|
||||
|
||||
readCount := 0
|
||||
lastIsTag := false
|
||||
for {
|
||||
parts, partInTag, err := es.readRunes()
|
||||
readCount++
|
||||
if err == io.EOF {
|
||||
return es.escaped, nil
|
||||
} else if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
for i, part := range parts {
|
||||
if partInTag[i] {
|
||||
lastIsTag = true
|
||||
if _, err := out.Write(part); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
} else {
|
||||
// if last part is tag, then this part is content begin
|
||||
// if the content is the first part of the first read, then it's also content begin
|
||||
isContentBegin := lastIsTag || (readCount == 1 && i == 0)
|
||||
lastIsTag = false
|
||||
if isContentBegin {
|
||||
if part, err = es.trimAndWriteBom(part); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
if err = es.detectAndWriteRunes(part); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (e *escapeStreamer) trimAndWriteBom(part []byte) ([]byte, error) {
|
||||
remaining, ok := bytes.CutPrefix(part, globalVars().utf8Bom)
|
||||
if ok {
|
||||
part = remaining
|
||||
if _, err := e.out.Write(globalVars().utf8Bom); err != nil {
|
||||
return part, err
|
||||
}
|
||||
}
|
||||
return part, nil
|
||||
}
|
||||
|
||||
// longSentenceDetectionLimit is the number of neighbouring runes inspected on
// each side of a rune by possibleLongSentence; analyzeDetectResults also skips
// the long-sentence heuristic for inputs with fewer runes than this.
const longSentenceDetectionLimit = 20
|
||||
|
||||
func (e *escapeStreamer) possibleLongSentence(results []detectResult, pos int) bool {
|
||||
countBasic := 0
|
||||
countNonASCII := 0
|
||||
for i := max(pos-longSentenceDetectionLimit, 0); i < min(pos+longSentenceDetectionLimit, len(results)); i++ {
|
||||
if results[i].runeType == runeTypeBasic && results[i].runeChar != ' ' {
|
||||
countBasic++
|
||||
}
|
||||
if results[i].runeType == runeTypeNonASCII || results[i].runeType == runeTypeAmbiguous {
|
||||
countNonASCII++
|
||||
}
|
||||
}
|
||||
countChar := countBasic + countNonASCII
|
||||
// many non-ASCII runes around, it seems to be a sentence,
|
||||
// don't handle the invisible/ambiguous chars in it, otherwise it will be too noisy
|
||||
return countChar != 0 && countNonASCII*100/countChar >= 50
|
||||
}
|
||||
|
||||
func (e *escapeStreamer) analyzeDetectResults(results []detectResult) {
|
||||
for i := range results {
|
||||
res := &results[i]
|
||||
if res.runeType == runeTypeInvisible || res.runeType == runeTypeAmbiguous {
|
||||
leftIsNonASCII := i > 0 && (results[i-1].runeType == runeTypeNonASCII || results[i-1].runeType == runeTypeAmbiguous)
|
||||
rightIsNonASCII := i < len(results)-1 && (results[i+1].runeType == runeTypeNonASCII || results[i+1].runeType == runeTypeAmbiguous)
|
||||
surroundingNonASCII := leftIsNonASCII || rightIsNonASCII
|
||||
if !surroundingNonASCII {
|
||||
if len(results) < longSentenceDetectionLimit {
|
||||
res.needEscape = setting.UI.AmbiguousUnicodeDetection
|
||||
} else if !e.possibleLongSentence(results, i) {
|
||||
res.needEscape = setting.UI.AmbiguousUnicodeDetection
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (e *escapeStreamer) detectAndWriteRunes(part []byte) error {
|
||||
results := e.detectRunes(part)
|
||||
e.analyzeDetectResults(results)
|
||||
return e.writeDetectResults(part, results)
|
||||
}
|
||||
|
||||
func (e *htmlChunkReader) readRunes() (parts [][]byte, partInTag []bool, _ error) {
|
||||
// we have read everything, eof
|
||||
if e.readErr != nil && len(e.readBuf) == 0 {
|
||||
return nil, nil, e.readErr
|
||||
}
|
||||
|
||||
// not eof, and the there is space in the buffer, try to read more data
|
||||
if e.readErr == nil && len(e.readBuf) <= cap(e.readBuf)*3/4 {
|
||||
n, err := e.in.Read(e.readBuf[len(e.readBuf):cap(e.readBuf)])
|
||||
e.readErr = err
|
||||
e.readBuf = e.readBuf[:len(e.readBuf)+n]
|
||||
}
|
||||
if len(e.readBuf) == 0 {
|
||||
return nil, nil, e.readErr
|
||||
}
|
||||
|
||||
// try to exact tag parts and content parts
|
||||
pos := 0
|
||||
for pos < len(e.readBuf) {
|
||||
var curPartEnd int
|
||||
nextInTag := e.curInTag
|
||||
if e.curInTag {
|
||||
// if cur part is in tag, try to find the tag close char '>'
|
||||
idx := bytes.IndexByte(e.readBuf[pos:], '>')
|
||||
if idx == -1 {
|
||||
// if no tag close char, then the whole buffer is in tag
|
||||
curPartEnd = len(e.readBuf)
|
||||
} else {
|
||||
// tag part ends, switch to content part
|
||||
curPartEnd = pos + idx + 1
|
||||
nextInTag = !nextInTag
|
||||
}
|
||||
} else {
|
||||
// if cur part is in content, try to find the tag open char '<'
|
||||
idx := bytes.IndexByte(e.readBuf[pos:], '<')
|
||||
if idx == -1 {
|
||||
// if no tag open char, then the whole buffer is in content
|
||||
curPartEnd = len(e.readBuf)
|
||||
} else {
|
||||
// content part ends, switch to tag part
|
||||
curPartEnd = pos + idx
|
||||
nextInTag = !nextInTag
|
||||
}
|
||||
}
|
||||
|
||||
curPartLen := curPartEnd - pos
|
||||
if curPartLen == 0 {
|
||||
// if cur part is empty, only need to switch the part type
|
||||
if e.curInTag == nextInTag {
|
||||
panic("impossible, curPartLen is 0 but the part in tag status is not switched")
|
||||
}
|
||||
e.curInTag = nextInTag
|
||||
continue
|
||||
}
|
||||
|
||||
// now, curPartLen can't be 0
|
||||
curPart := make([]byte, curPartLen)
|
||||
copy(curPart, e.readBuf[pos:curPartEnd])
|
||||
// now we get the curPart bytes, but we can't directly use it, the last rune in it might have been cut
|
||||
// try to decode the last rune, if it's invalid, then we cut the last byte and try again until we get a valid rune or no byte left
|
||||
for i := curPartLen - 1; i >= 0; i-- {
|
||||
last, lastSize := utf8.DecodeRune(curPart[i:])
|
||||
if last == utf8.RuneError && lastSize == 1 {
|
||||
curPartLen--
|
||||
} else {
|
||||
curPartLen += lastSize - 1
|
||||
break
|
||||
}
|
||||
}
|
||||
if curPartLen == 0 {
|
||||
// actually it's impossible that the part doesn't contain any valid rune,
|
||||
// the only case is that the cap(readBuf) is too small, or the origin contain indeed doesn't contain any valid rune
|
||||
// * try to leave the last 4 bytes (possible longest utf-8 encoding) to next round
|
||||
// * at least consume 1 byte to avoid infinite loop
|
||||
curPartLen = max(len(curPart)-utf8.UTFMax, 1)
|
||||
}
|
||||
|
||||
// if curPartLen is not the same as curPart, it means we have cut some bytes,
|
||||
// need to wait for more data if not eof
|
||||
trailingCorrupted := curPartLen != len(curPart)
|
||||
|
||||
// finally, we get the real part we need
|
||||
curPart = curPart[:curPartLen]
|
||||
parts = append(parts, curPart)
|
||||
partInTag = append(partInTag, e.curInTag)
|
||||
|
||||
pos += curPartLen
|
||||
e.curInTag = nextInTag
|
||||
|
||||
if trailingCorrupted && e.readErr == nil {
|
||||
// if the last part is corrupted, and we haven't reach eof, then we need to wait for more data to get the complete part
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
copy(e.readBuf, e.readBuf[pos:])
|
||||
e.readBuf = e.readBuf[:len(e.readBuf)-pos]
|
||||
return parts, partInTag, nil
|
||||
}
|
||||
|
||||
func (e *escapeStreamer) writeDetectResults(data []byte, results []detectResult) error {
|
||||
lastWriteRawIdx := -1
|
||||
for idx := range results {
|
||||
res := &results[idx]
|
||||
if !res.needEscape {
|
||||
if lastWriteRawIdx == -1 {
|
||||
lastWriteRawIdx = idx
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
if lastWriteRawIdx != -1 {
|
||||
if _, err := e.out.Write(data[results[lastWriteRawIdx].position:res.position]); err != nil {
|
||||
return err
|
||||
}
|
||||
lastWriteRawIdx = -1
|
||||
}
|
||||
switch res.runeType {
|
||||
case runeTypeBroken:
|
||||
if err := e.writeBrokenRune(data[res.position : res.position+res.runeSize]); err != nil {
|
||||
return err
|
||||
}
|
||||
case runeTypeAmbiguous:
|
||||
if err := e.writeAmbiguousRune(res.runeChar, res.confusable); err != nil {
|
||||
return err
|
||||
}
|
||||
case runeTypeInvisible:
|
||||
if err := e.writeInvisibleRune(res.runeChar); err != nil {
|
||||
return err
|
||||
}
|
||||
case runeTypeControlChar:
|
||||
if err := e.writeControlRune(res.runeChar); err != nil {
|
||||
return err
|
||||
}
|
||||
default:
|
||||
panic("unreachable")
|
||||
}
|
||||
}
|
||||
if lastWriteRawIdx != -1 {
|
||||
lastResult := results[len(results)-1]
|
||||
if _, err := e.out.Write(data[results[lastWriteRawIdx].position : lastResult.position+lastResult.runeSize]); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (e *escapeStreamer) writeBrokenRune(_ []byte) (err error) {
|
||||
// Although we'd like to use the original bytes to display (show the real broken content to users),
|
||||
// however, when this "escape stream" module is applied to the content, the content has already been processed by other modules.
|
||||
// So the invalid bytes just can't be kept till this step, in most (all) cases, the only thing we see here is utf8.RuneError
|
||||
_, err = io.WriteString(e.out, `<span class="broken-code-point">�</span>`)
|
||||
return err
|
||||
}
|
||||
|
||||
func (e *escapeStreamer) writeEscapedCharHTML(tag1, attr, tag2, content, tag3 string) (err error) {
|
||||
_, err = io.WriteString(e.out, tag1)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
_, err = io.WriteString(e.out, html.EscapeString(attr))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
_, err = io.WriteString(e.out, tag2)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
_, err = io.WriteString(e.out, html.EscapeString(content))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
_, err = io.WriteString(e.out, tag3)
|
||||
return err
|
||||
}
|
||||
|
||||
// runeToHex renders r as "[U+XXXX]": upper-case hexadecimal, zero-padded to
// at least four digits.
func runeToHex(r rune) string {
	return "[" + fmt.Sprintf("U+%04X", r) + "]"
}
|
||||
|
||||
func (e *escapeStreamer) writeAmbiguousRune(r, c rune) (err error) {
|
||||
e.escaped.Escaped = true
|
||||
e.escaped.HasAmbiguous = true
|
||||
return e.writeEscapedCharHTML(
|
||||
`<span class="ambiguous-code-point" data-tooltip-content="`,
|
||||
e.locale.TrString("repo.ambiguous_character", string(r)+" "+runeToHex(r), string(c)+" "+runeToHex(c)),
|
||||
`"><span class="char">`,
|
||||
string(r),
|
||||
`</span></span>`,
|
||||
)
|
||||
}
|
||||
|
||||
func (e *escapeStreamer) writeInvisibleRune(r rune) error {
|
||||
e.escaped.Escaped = true
|
||||
e.escaped.HasInvisible = true
|
||||
return e.writeEscapedCharHTML(
|
||||
`<span class="escaped-code-point" data-escaped="`,
|
||||
runeToHex(r),
|
||||
`"><span class="char">`,
|
||||
string(r),
|
||||
`</span></span>`,
|
||||
)
|
||||
}
|
||||
|
||||
func (e *escapeStreamer) writeControlRune(r rune) error {
|
||||
var display string
|
||||
if r >= 0 && r <= 0x1f {
|
||||
display = string(0x2400 + r)
|
||||
} else if r == 0x7f {
|
||||
display = string(rune(0x2421))
|
||||
} else {
|
||||
display = runeToHex(r)
|
||||
}
|
||||
return e.writeEscapedCharHTML(
|
||||
`<span class="broken-code-point" data-escaped="`,
|
||||
display,
|
||||
`"><span class="char">`,
|
||||
string(r),
|
||||
`</span></span>`,
|
||||
)
|
||||
}
|
||||
|
||||
// detectResult describes one decoded rune of a content part.
type detectResult struct {
	// runeChar is the decoded rune (utf8.RuneError for undecodable bytes).
	runeChar rune
	// runeType is one of the runeType* constants.
	runeType int
	// runeSize is the number of bytes the rune occupies in the data.
	runeSize int
	// position is the byte offset of the rune in the data.
	position int
	// confusable is the rune an ambiguous rune can be confused with.
	confusable rune
	// needEscape reports whether the rune must be written in escaped form.
	needEscape bool
}
|
||||
|
||||
// Rune classifications stored in detectResult.runeType.
const (
	// runeTypeBasic is the zero value: a rune that matched no other class.
	runeTypeBasic int = iota
	// runeTypeBroken is a rune that decoded to utf8.RuneError.
	runeTypeBroken
	// runeTypeNonASCII is a rune >= 0x80 that is neither invisible nor ambiguous.
	runeTypeNonASCII
	// runeTypeAmbiguous is a rune reported by isAmbiguous.
	runeTypeAmbiguous
	// runeTypeInvisible is a rune found in the invisible range table.
	runeTypeInvisible
	// runeTypeControlChar is a rune below 0x20 or equal to 0x7f.
	runeTypeControlChar
)
|
||||
|
||||
func (e *escapeStreamer) detectRunes(data []byte) []detectResult {
|
||||
runeCount := utf8.RuneCount(data)
|
||||
results := make([]detectResult, runeCount)
|
||||
invisibleRangeTable := globalVars().invisibleRangeTable
|
||||
var i int
|
||||
var confusable rune
|
||||
for pos := 0; pos < len(data); i++ {
|
||||
r, runeSize := utf8.DecodeRune(data[pos:])
|
||||
results[i].runeChar = r
|
||||
results[i].runeSize = runeSize
|
||||
results[i].position = pos
|
||||
pos += runeSize
|
||||
|
||||
switch {
|
||||
case r == utf8.RuneError:
|
||||
results[i].runeType = runeTypeBroken
|
||||
results[i].needEscape = true
|
||||
case r == ' ' || r == '\t' || r == '\n' || e.allowed[r]:
|
||||
results[i].runeType = runeTypeBasic
|
||||
if r >= 0x80 {
|
||||
results[i].runeType = runeTypeNonASCII
|
||||
}
|
||||
case r < 0x20 || r == 0x7f:
|
||||
results[i].runeType = runeTypeControlChar
|
||||
results[i].needEscape = true
|
||||
case unicode.Is(invisibleRangeTable, r):
|
||||
results[i].runeType = runeTypeInvisible
|
||||
// not sure about results[i].needEscape, will be detected separately
|
||||
case isAmbiguous(r, &confusable, e.ambiguousTables...):
|
||||
results[i].runeType = runeTypeAmbiguous
|
||||
results[i].confusable = confusable
|
||||
// not sure about results[i].needEscape, will be detected separately
|
||||
case r >= 0x80:
|
||||
results[i].runeType = runeTypeNonASCII
|
||||
default: // details to basic runes
|
||||
}
|
||||
}
|
||||
return results
|
||||
}
|
||||
Reference in New Issue
Block a user