初始提交: Gitea 项目代码

This commit is contained in:
root
2026-05-30 22:47:36 +08:00
commit f288f76350
6116 changed files with 776822 additions and 0 deletions
+59
View File
@@ -0,0 +1,59 @@
// Copyright 2022 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package charset
import (
"sort"
"strings"
"unicode"
"gitea.dev/modules/translation"
)
// AmbiguousTablesForLocale returns the ambiguous-character tables to use for the given locale:
// the best matching locale-specific table first, followed by the "_common" table.
func AmbiguousTablesForLocale(locale translation.Locale) []*AmbiguousTable {
	tables := globalVars().ambiguousTableMap
	lang := locale.Language()

	// Try the full language tag first, then strip trailing "-xx" / "_xx" segments one by one.
	var matched *AmbiguousTable
	for candidate := lang; candidate != ""; {
		if found, ok := tables[candidate]; ok {
			matched = found
			break
		}
		cut := strings.LastIndexAny(candidate, "-_")
		if cut < 0 {
			break
		}
		candidate = candidate[:cut]
	}

	// Fallbacks are tried in order; each one only applies when nothing has matched so far.
	if matched == nil {
		if lang == "zh-CN" || lang == "zh_CN" {
			matched = tables["zh-hans"]
		}
		if matched == nil && strings.HasPrefix(lang, "zh") {
			matched = tables["zh-hant"]
		}
		if matched == nil {
			matched = tables["_default"]
		}
	}

	return []*AmbiguousTable{matched, tables["_common"]}
}
// isAmbiguous reports whether r is listed as a confusable rune in any of the given tables.
// On a match, the rune that r can be confused with is stored in *confusableTo.
// Nil tables are skipped, and *confusableTo is left untouched when no table matches.
func isAmbiguous(r rune, confusableTo *rune, tables ...*AmbiguousTable) bool {
	for _, table := range tables {
		if table == nil || !unicode.Is(table.RangeTable, r) {
			continue
		}
		// Confusable is searched with a binary search, so it must be sorted ascending.
		i := sort.Search(len(table.Confusable), func(i int) bool {
			return table.Confusable[i] >= r
		})
		// Only accept an exact hit with a matching With entry; this avoids an
		// out-of-range index (or a wrong replacement) if the range table and the
		// Confusable/With slices ever disagree.
		if i >= len(table.Confusable) || table.Confusable[i] != r || i >= len(table.With) {
			continue
		}
		*confusableTo = table.With[i]
		return true
	}
	return false
}
File diff suppressed because one or more lines are too long
+37
View File
@@ -0,0 +1,37 @@
// Copyright 2022 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package charset
import (
"sort"
"testing"
"unicode"
"gitea.dev/modules/translation"
"github.com/stretchr/testify/assert"
)
// TestAmbiguousCharacters checks the internal consistency of every ambiguous table
// (locale name, parallel slice lengths, sort order, range table coverage) and that a
// known confusable rune is detected for the mock locale.
func TestAmbiguousCharacters(t *testing.T) {
	for name, table := range globalVars().ambiguousTableMap {
		assert.Equal(t, name, table.Locale)
		assert.Len(t, table.With, len(table.Confusable))

		sorted := sort.SliceIsSorted(table.Confusable, func(a, b int) bool {
			return table.Confusable[a] < table.Confusable[b]
		})
		assert.True(t, sorted)

		for _, c := range table.Confusable {
			assert.True(t, unicode.Is(table.RangeTable, c))
			idx := sort.Search(len(table.Confusable), func(k int) bool {
				return table.Confusable[k] >= c
			})
			found := idx < len(table.Confusable) && table.Confusable[idx] == c
			assert.True(t, found, "%c is not in %d", c, idx)
		}
	}

	var confusableTo rune
	ambiguous := isAmbiguous('𝐾', &confusableTo, AmbiguousTablesForLocale(&translation.MockLocale{})...)
	assert.True(t, ambiguous)
}
+210
View File
@@ -0,0 +1,210 @@
// Copyright 2014 The Gogs Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package charset
import (
"bytes"
"io"
"regexp"
"strings"
"sync"
"unicode"
"unicode/utf8"
"gitea.dev/modules/setting"
"gitea.dev/modules/util"
"github.com/gogs/chardet"
"golang.org/x/net/html/charset"
"golang.org/x/text/transform"
)
// globalVars returns the package-wide shared values. They are built once, on the first
// call (sync.OnceValue), and the same value is returned by every later call.
var globalVars = sync.OnceValue(func() (ret struct {
	utf8Bom             []byte                     // the 3-byte UTF-8 byte order mark (EF BB BF)
	defaultWordRegexp   *regexp.Regexp             // not assigned by this initializer
	ambiguousTableMap   map[string]*AmbiguousTable // looked up by locale name, plus the "_default" and "_common" keys
	invisibleRangeTable *unicode.RangeTable        // runes that are treated as invisible by the escaper
},
) {
	ret.utf8Bom = []byte{'\xef', '\xbb', '\xbf'}
	ret.ambiguousTableMap = newAmbiguousTableMap()
	ret.invisibleRangeTable = newInvisibleRangeTable()
	return ret
})
// ConvertOpts controls how the ToUTF8* helpers convert content.
type ConvertOpts struct {
	// KeepBOM keeps a leading UTF-8 BOM; when false the BOM is trimmed from the result.
	KeepBOM bool
	// ErrorReplacement is appended in place of a byte that fails to decode in ToUTF8.
	// When nil, the original byte is copied through instead.
	ErrorReplacement []byte
	// ErrorReturnOrigin makes ToUTF8 return the original content unchanged as soon as a conversion error occurs.
	ErrorReturnOrigin bool
}
// ToUTF8WithFallbackReaderPrefetchSize is the number of bytes ToUTF8WithFallbackReader reads
// up-front for encoding detection. It is a variable (not a constant) so it can be overridden.
var ToUTF8WithFallbackReaderPrefetchSize = 16 * 1024
// ToUTF8WithFallbackReader detects the encoding from a prefetched prefix of rd and returns a
// reader that yields the content as UTF-8 if possible. When the prefix cannot be read or the
// detected charset is unknown, the content is passed through unmodified.
func ToUTF8WithFallbackReader(rd io.Reader, opts ConvertOpts) io.Reader {
	prefetched := make([]byte, ToUTF8WithFallbackReaderPrefetchSize)
	n, err := util.ReadAtMost(rd, prefetched)
	head := prefetched[:n]

	// replay yields the given prefix followed by whatever has not been read from rd yet
	replay := func(prefix []byte) io.Reader {
		return io.MultiReader(bytes.NewReader(prefix), rd)
	}

	if err != nil {
		// read error occurs, don't do any processing
		return replay(head)
	}

	charsetLabel, _ := DetectEncoding(head)
	if charsetLabel == "UTF-8" {
		// already utf-8: only the BOM may need to be removed
		return replay(maybeRemoveBOM(head, opts))
	}

	if encoding, _ := charset.Lookup(charsetLabel); encoding != nil {
		// convert from the detected charset to utf-8
		return transform.NewReader(replay(head), encoding.NewDecoder())
	}

	// unknown charset, don't do any processing
	return replay(head)
}
// ToUTF8WithFallback detects the encoding of content and converts it to UTF-8 if possible.
// It is the in-memory counterpart of ToUTF8WithFallbackReader.
func ToUTF8WithFallback(content []byte, opts ConvertOpts) []byte {
	converted := ToUTF8WithFallbackReader(bytes.NewReader(content), opts)
	// a read error is discarded here: whatever could be read is returned
	out, _ := io.ReadAll(converted)
	return out
}
func ToUTF8DropErrors(content []byte) []byte {
return ToUTF8(content, ConvertOpts{ErrorReplacement: []byte{' '}})
}
// ToUTF8 detects the encoding of content and converts it to UTF-8.
//
// UTF-8 input is returned as-is (minus the BOM unless opts.KeepBOM is set). If the detected
// charset has no decoder, the result is either the original content (opts.ErrorReturnOrigin)
// or the content with invalid UTF-8 sequences replaced by opts.ErrorReplacement.
// While decoding, each failure is handled according to opts: return the original content,
// append opts.ErrorReplacement, or (when the replacement is nil) copy the offending byte through.
func ToUTF8(content []byte, opts ConvertOpts) []byte {
	charsetLabel, _ := DetectEncoding(content)
	if charsetLabel == "UTF-8" {
		return maybeRemoveBOM(content, opts)
	}

	encoding, _ := charset.Lookup(charsetLabel)
	if encoding == nil {
		setting.PanicInDevOrTesting("unsupported detected charset %q, it shouldn't happen", charsetLabel)
		if opts.ErrorReturnOrigin {
			return content
		}
		return bytes.ToValidUTF8(content, opts.ErrorReplacement)
	}

	var decoded []byte
	decoder := encoding.NewDecoder()
	idx := 0
	for idx < len(content) {
		result, n, err := transform.Bytes(decoder, content[idx:])
		decoded = append(decoded, result...)
		if err == nil {
			break
		}
		if opts.ErrorReturnOrigin {
			return content
		}

		// failedAt is the offset of the byte right after the consumed input
		failedAt := idx + n
		switch {
		case opts.ErrorReplacement != nil:
			decoded = append(decoded, opts.ErrorReplacement...)
		case failedAt < len(content):
			// No replacement configured: keep the offending byte. The bounds check guards
			// against an error reported after all remaining input was consumed, where
			// content[failedAt] would be out of range.
			decoded = append(decoded, content[failedAt])
		}
		// skip the offending byte and continue with the rest
		idx = failedAt + 1
	}
	return maybeRemoveBOM(decoded, opts)
}
// maybeRemoveBOM strips a leading UTF-8 BOM from content unless opts.KeepBOM is set.
func maybeRemoveBOM(content []byte, opts ConvertOpts) []byte {
	if !opts.KeepBOM {
		content = bytes.TrimPrefix(content, globalVars().utf8Bom)
	}
	return content
}
// DetectEncoding detects the encoding of content.
// It always returns a detected or guessed "encoding" string, whether or not an error happens.
func DetectEncoding(content []byte) (encoding string, _ error) {
	// First check whether the content is valid UTF-8, tolerating one truncated character at the end.
	// Instead of decoding every rune, walk backwards (at most 4 bytes) to cut off the last character.
	//
	// U+0000   U+007F    0yyyzzzz
	// U+0080   U+07FF    110xxxyy 10yyzzzz
	// U+0800   U+FFFF    1110wwww 10xxxxyy 10yyzzzz
	// U+010000 U+10FFFF  11110uvv 10vvwwww 10xxxxyy 10yyzzzz
	candidate := content
	for last, seen := len(candidate)-1, 0; last >= 0 && seen < 4; seen++ {
		b := candidate[last]
		isLeadingByte := b>>5 == 0b110 || b>>4 == 0b1110 || b>>3 == 0b11110
		if isLeadingByte {
			candidate = candidate[:last]
			break
		}
		if b>>6 != 0b10 {
			// neither a leading nor a continuation byte: nothing to trim
			break
		}
		// a continuation byte, keep walking backwards
		last--
	}
	if utf8.Valid(candidate) {
		return "UTF-8", nil
	}

	detector := chardet.NewTextDetector()
	sample := content
	if len(content) < 1024 {
		// Check if the original content is detectable at all
		if _, err := detector.DetectBest(content); err != nil {
			return util.IfZero(setting.Repository.AnsiCharset, "UTF-8"), err
		}
		// short content is repeated so that the detector sees up to 1024 bytes
		sample = bytes.Repeat(content, 1024/len(content))
	}

	// DetectBest or results[0] can't be used directly because the result isn't stable, a tie-break is needed
	results, err := detector.DetectAll(sample)
	if err != nil {
		return util.IfZero(setting.Repository.AnsiCharset, "UTF-8"), err
	}

	scoreOf := func(name string) (int, bool) {
		score, ok := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(name))]
		return score, ok
	}

	best := results[0]
	bestScore, bestKnown := scoreOf(best.Charset)
	for _, cand := range results {
		// Results are sorted by confidence, so a different confidence means a lower one: stop there.
		if cand.Confidence != results[0].Confidence {
			break
		}
		// Prefer the candidate only if it has a score and that score ranks before the current best.
		candScore, candKnown := scoreOf(cand.Charset)
		if !candKnown {
			continue
		}
		if bestKnown && candScore >= bestScore {
			continue
		}
		best, bestScore, bestKnown = cand, candScore, true
	}

	// FIXME: to properly decouple this function the fallback ANSI charset should be passed as an argument
	if best.Charset != "UTF-8" && setting.Repository.AnsiCharset != "" {
		return setting.Repository.AnsiCharset, nil
	}
	return best.Charset, nil
}
+247
View File
@@ -0,0 +1,247 @@
// Copyright 2019 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package charset
import (
"io"
"os"
"strings"
"testing"
"gitea.dev/modules/setting"
"gitea.dev/modules/test"
"github.com/stretchr/testify/assert"
)
// TestMain builds the charset score table from the configured detection order
// (lower index means higher priority) before running the tests.
func TestMain(m *testing.M) {
	order := setting.Repository.DetectedCharsetsOrder
	scores := make(map[string]int, len(order))
	for rank, name := range order {
		scores[strings.ToLower(name)] = rank
	}
	setting.Repository.DetectedCharsetScore = scores
	os.Exit(m.Run())
}
func TestMaybeRemoveBOM(t *testing.T) {
res := maybeRemoveBOM([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{})
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
res = maybeRemoveBOM([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{})
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
}
// TestToUTF8 exercises ToUTF8 with ASCII, UTF-8 (with and without BOM), non-UTF-8
// single-byte input, Shift-JIS input and NUL bytes.
func TestToUTF8(t *testing.T) {
	// Note: golang compiler seems to behave differently depending on the current
	// locale, so some conversions might behave differently. For that reason, we don't
	// depend on particular conversions but on expected behaviors.
	res := ToUTF8([]byte{0x41, 0x42, 0x43}, ConvertOpts{})
	assert.Equal(t, "ABC", string(res))
	// "áéíóú"
	res = ToUTF8([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{})
	assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
	// UTF-8 BOM + "áéíóú": the BOM must be removed
	res = ToUTF8([]byte{
		0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3,
		0xc3, 0xba,
	}, ConvertOpts{})
	assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
	// non-UTF-8 input: only the ASCII prefix and suffix are checked
	res = ToUTF8([]byte{
		0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
		0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e,
	}, ConvertOpts{})
	stringMustStartWith(t, "Hola,", res)
	stringMustEndWith(t, "AAA.", res)
	// same text with the bytes 0x07 0xA4 in the middle
	res = ToUTF8([]byte{
		0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
		0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e,
	}, ConvertOpts{})
	stringMustStartWith(t, "Hola,", res)
	stringMustEndWith(t, "AAA.", res)
	// same text with the bytes 0x81 0xA4 in the middle
	res = ToUTF8([]byte{
		0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
		0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e,
	}, ConvertOpts{})
	stringMustStartWith(t, "Hola,", res)
	stringMustEndWith(t, "AAA.", res)
	// Japanese (Shift-JIS)
	// 日属秘ぞしちゅ。
	res = ToUTF8([]byte{
		0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82,
		0xBF, 0x82, 0xE3, 0x81, 0x42,
	}, ConvertOpts{})
	assert.Equal(t, []byte{
		0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3,
		0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82,
	}, res)
	// NUL bytes are expected to pass through unchanged
	res = ToUTF8([]byte{0x00, 0x00, 0x00, 0x00}, ConvertOpts{})
	assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, res)
}
// TestToUTF8WithFallback exercises ToUTF8WithFallback with ASCII, UTF-8 (with and without BOM),
// non-UTF-8 single-byte input, Shift-JIS input and NUL bytes.
func TestToUTF8WithFallback(t *testing.T) {
	// "ABC"
	res := ToUTF8WithFallback([]byte{0x41, 0x42, 0x43}, ConvertOpts{})
	assert.Equal(t, []byte{0x41, 0x42, 0x43}, res)
	// "áéíóú"
	res = ToUTF8WithFallback([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{})
	assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
	// UTF8 BOM + "áéíóú"
	res = ToUTF8WithFallback([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{})
	assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
	// "Hola, así cómo ños" in a single-byte encoding, expected back as UTF-8
	res = ToUTF8WithFallback([]byte{
		0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
		0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73,
	}, ConvertOpts{})
	assert.Equal(t, []byte{
		0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xC3, 0xAD, 0x20, 0x63,
		0xC3, 0xB3, 0x6D, 0x6F, 0x20, 0xC3, 0xB1, 0x6F, 0x73,
	}, res)
	// "Hola, así cómo " (UTF-8): the prefix that must match regardless of how the tail is converted
	minmatch := []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xC3, 0xAD, 0x20, 0x63, 0xC3, 0xB3, 0x6D, 0x6F, 0x20}
	res = ToUTF8WithFallback([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73}, ConvertOpts{})
	// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
	assert.Equal(t, minmatch, res[0:len(minmatch)])
	res = ToUTF8WithFallback([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73}, ConvertOpts{})
	// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
	assert.Equal(t, minmatch, res[0:len(minmatch)])
	// Japanese (Shift-JIS)
	// "日属秘ぞしちゅ。"
	res = ToUTF8WithFallback([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42}, ConvertOpts{})
	assert.Equal(t, []byte{
		0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3,
		0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82,
	}, res)
	// NUL bytes are expected to pass through unchanged
	res = ToUTF8WithFallback([]byte{0x00, 0x00, 0x00, 0x00}, ConvertOpts{})
	assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, res)
}
// TestToUTF8DropErrors exercises ToUTF8DropErrors with ASCII, UTF-8 (with and without BOM),
// non-UTF-8 single-byte input, Shift-JIS input and NUL bytes.
func TestToUTF8DropErrors(t *testing.T) {
	// "ABC"
	res := ToUTF8DropErrors([]byte{0x41, 0x42, 0x43})
	assert.Equal(t, []byte{0x41, 0x42, 0x43}, res)
	// "áéíóú"
	res = ToUTF8DropErrors([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
	assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
	// UTF8 BOM + "áéíóú"
	res = ToUTF8DropErrors([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
	assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
	// "Hola, así cómo ños": only the ASCII prefix "Hola, as" and the final "s" are checked
	res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73})
	assert.Equal(t, []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73}, res[:8])
	assert.Equal(t, []byte{0x73}, res[len(res)-1:])
	// "Hola, así cómo " (UTF-8): the prefix that must match regardless of how the tail is converted
	minmatch := []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xC3, 0xAD, 0x20, 0x63, 0xC3, 0xB3, 0x6D, 0x6F, 0x20}
	res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73})
	// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
	assert.Equal(t, minmatch, res[0:len(minmatch)])
	res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73})
	// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
	assert.Equal(t, minmatch, res[0:len(minmatch)])
	// Japanese (Shift-JIS)
	// "日属秘ぞしちゅ。"
	res = ToUTF8DropErrors([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42})
	assert.Equal(t, []byte{
		0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3,
		0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82,
	}, res)
	// NUL bytes are expected to pass through unchanged
	res = ToUTF8DropErrors([]byte{0x00, 0x00, 0x00, 0x00})
	assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, res)
}
// TestDetectEncoding checks DetectEncoding on invalid bytes, UTF-8 (with and without BOM),
// UTF-16LE and ISO-8859-1 input, and that a configured AnsiCharset overrides a non-UTF-8 result.
func TestDetectEncoding(t *testing.T) {
	// testSuccess asserts that detection succeeds and yields exactly the expected label
	testSuccess := func(b []byte, expected string) {
		encoding, err := DetectEncoding(b)
		assert.NoError(t, err)
		assert.Equal(t, expected, encoding)
	}
	// invalid bytes: an error is expected together with the "UTF-8" fallback label
	encoding, err := DetectEncoding([]byte{0xfa})
	assert.Error(t, err)
	assert.Equal(t, "UTF-8", encoding)
	// utf-8
	b := []byte("just some ascii")
	testSuccess(b, "UTF-8")
	// utf-8-sig: "hey" (with BOM)
	b = []byte{0xef, 0xbb, 0xbf, 0x68, 0x65, 0x79}
	testSuccess(b, "UTF-8")
	// utf-16: "hey<accented G>"
	b = []byte{0xff, 0xfe, 0x68, 0x00, 0x65, 0x00, 0x79, 0x00, 0xf4, 0x01}
	testSuccess(b, "UTF-16LE")
	// iso-8859-1: d<accented e>cor<newline>
	b = []byte{0x44, 0xe9, 0x63, 0x6f, 0x72, 0x0a}
	encoding, err = DetectEncoding(b)
	assert.NoError(t, err)
	assert.Contains(t, encoding, "ISO-8859-1")
	// with AnsiCharset configured, the same non-UTF-8 content must report the configured charset
	defer test.MockVariableValue(&setting.Repository.AnsiCharset, "MyEncoding")()
	testSuccess(b, "MyEncoding")
}
// stringMustStartWith asserts that value begins with expected.
// A value shorter than expected is reported as a test failure instead of a slice-bounds panic.
func stringMustStartWith(t *testing.T, expected string, value []byte) {
	t.Helper()
	if !assert.GreaterOrEqual(t, len(value), len(expected), "value %q is shorter than the expected prefix %q", value, expected) {
		return
	}
	assert.Equal(t, expected, string(value[:len(expected)]))
}
// stringMustEndWith asserts that value ends with expected.
// A value shorter than expected is reported as a test failure instead of a slice-bounds panic.
func stringMustEndWith(t *testing.T, expected string, value []byte) {
	t.Helper()
	if !assert.GreaterOrEqual(t, len(value), len(expected), "value %q is shorter than the expected suffix %q", value, expected) {
		return
	}
	assert.Equal(t, expected, string(value[len(value)-len(expected):]))
}
// TestToUTF8WithFallbackReader checks that UTF-8 content is detected and passed through intact
// for every possible prefetch size, i.e. wherever the prefetch buffer cuts a multi-byte rune.
func TestToUTF8WithFallbackReader(t *testing.T) {
	// The loop below overwrites the package-level prefetch size, so the value has to be put back
	// when the test ends. The returned func is deferred the same way TestDetectEncoding does it
	// (assumes it restores the original value).
	defer test.MockVariableValue(&ToUTF8WithFallbackReaderPrefetchSize)()

	// one rune of each UTF-8 encoded length: 1, 2, 3 and 4 bytes
	block := "aá啊🤔"
	runes := []rune(block)
	assert.Len(t, string(runes[0]), 1)
	assert.Len(t, string(runes[1]), 2)
	assert.Len(t, string(runes[2]), 3)
	assert.Len(t, string(runes[3]), 4)

	content := strings.Repeat(block, 2)
	for i := 1; i < len(content); i++ {
		// every prefix, including ones ending in a truncated rune, must be detected as UTF-8
		encoding, err := DetectEncoding([]byte(content[:i]))
		assert.NoError(t, err)
		assert.Equal(t, "UTF-8", encoding)

		ToUTF8WithFallbackReaderPrefetchSize = i
		rd := ToUTF8WithFallbackReader(strings.NewReader(content), ConvertOpts{})
		r, err := io.ReadAll(rd)
		assert.NoError(t, err)
		assert.Equal(t, content, string(r))
	}

	for _, r := range runes {
		content = "abc abc " + string(r) + string(r) + string(r)
		for i := 0; i < len(content); i++ {
			encoding, err := DetectEncoding([]byte(content[:i]))
			assert.NoError(t, err)
			assert.Equal(t, "UTF-8", encoding)
		}
	}
}
+43
View File
@@ -0,0 +1,43 @@
// Copyright 2022 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package charset
import (
"html/template"
"io"
"strings"
"gitea.dev/modules/setting"
"gitea.dev/modules/translation"
)
// EscapeOptions customizes the behavior of the Unicode escaper.
type EscapeOptions struct {
	// Allowed lists runes that are accepted as-is: a rune mapped to true is never
	// classified as control, invisible or ambiguous.
	Allowed map[rune]bool
}
// AllowRuneNBSP returns a new allow-list containing only the no-break space (U+00A0).
func AllowRuneNBSP() map[rune]bool {
	allowed := make(map[rune]bool, 1)
	allowed['\u00a0'] = true
	return allowed
}
// EscapeOptionsForView returns the escape options used when rendering a file view.
func EscapeOptionsForView() EscapeOptions {
	var opts EscapeOptions
	// it's safe to see NBSP in the view, but maybe not in the diff
	opts.Allowed = AllowRuneNBSP()
	return opts
}
// EscapeControlHTML escapes the Unicode control sequences in a provided html document.
// When ambiguous Unicode detection is disabled, the input is returned untouched with an empty status.
func EscapeControlHTML(html template.HTML, locale translation.Locale, opts ...EscapeOptions) (escaped *EscapeStatus, output template.HTML) {
	if setting.UI.AmbiguousUnicodeDetection {
		var sb strings.Builder
		// err has been handled in EscapeControlReader
		escaped, _ = EscapeControlReader(strings.NewReader(string(html)), &sb, locale, opts...)
		return escaped, template.HTML(sb.String())
	}
	return &EscapeStatus{}, html
}
// EscapeControlReader reads HTML content from reader, writes it to writer with the Unicode
// control sequences escaped for the given locale, and returns the findings as an EscapeStatus.
func EscapeControlReader(reader io.Reader, writer io.Writer, locale translation.Locale, opts ...EscapeOptions) (*EscapeStatus, error) {
	status, err := escapeStream(locale, reader, writer, opts...)
	return status, err
}
+23
View File
@@ -0,0 +1,23 @@
// Copyright 2021 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package charset
// EscapeStatus represents the findings of the Unicode escaper
type EscapeStatus struct {
	Escaped      bool // it means that some characters were escaped, and they can also be unescaped back
	HasInvisible bool // at least one invisible character was escaped
	HasAmbiguous bool // at least one ambiguous character was escaped
}

// Or merges other into status: each flag of the result is the logical OR (disjunction) of the
// corresponding flags of both. The receiver is updated in place and returned; a nil receiver
// yields a newly allocated status, and a nil other leaves the flags unchanged.
func (status *EscapeStatus) Or(other *EscapeStatus) *EscapeStatus {
	st := status
	if st == nil {
		st = &EscapeStatus{}
	}
	if other == nil {
		// nothing to merge; avoids a nil pointer dereference
		return st
	}
	st.Escaped = st.Escaped || other.Escaped
	st.HasAmbiguous = st.HasAmbiguous || other.HasAmbiguous
	st.HasInvisible = st.HasInvisible || other.HasInvisible
	return st
}
+418
View File
@@ -0,0 +1,418 @@
// Copyright 2022 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package charset
import (
"bytes"
"fmt"
"html"
"io"
"unicode"
"unicode/utf8"
"gitea.dev/modules/setting"
"gitea.dev/modules/translation"
)
// htmlChunkReader reads HTML from an io.Reader and splits it into alternating
// "tag" and "content" parts (see readRunes).
type htmlChunkReader struct {
	in       io.Reader // the source of the HTML
	readErr  error     // the last error returned by in.Read (including io.EOF); once set, no more reads are issued
	readBuf  []byte    // bytes that have been read but not yet handed out as parts
	curInTag bool      // whether the next byte to be processed is inside a "<...>" tag
}
// escapeStreamer escapes control, invisible and ambiguous runes in the content parts of an
// HTML stream, while tag parts are copied through unmodified.
type escapeStreamer struct {
	htmlChunkReader

	escaped         *EscapeStatus      // accumulated findings
	locale          translation.Locale // used to translate the tooltip of ambiguous runes
	ambiguousTables []*AmbiguousTable  // ambiguous-character tables selected for the locale
	allowed         map[rune]bool      // runes that are never escaped; may be nil

	out io.Writer // destination of the escaped output
}
// escapeStream copies the HTML from in to out, escaping suspicious runes in the content parts
// (tags are written unmodified), and returns what has been found.
// Only the first element of opts is used. On a read or write error the status is nil.
func escapeStream(locale translation.Locale, in io.Reader, out io.Writer, opts ...EscapeOptions) (*EscapeStatus, error) {
	es := &escapeStreamer{
		htmlChunkReader: htmlChunkReader{
			in:      in,
			readBuf: make([]byte, 0, 32*1024),
		},
		escaped:         &EscapeStatus{},
		locale:          locale,
		ambiguousTables: AmbiguousTablesForLocale(locale),
		out:             out,
	}
	if len(opts) > 0 {
		es.allowed = opts[0].Allowed
	}

	prevWasTag := false
	for round := 1; ; round++ {
		parts, inTag, err := es.readRunes()
		if err == io.EOF {
			return es.escaped, nil
		}
		if err != nil {
			return nil, err
		}

		for i, part := range parts {
			if inTag[i] {
				// tags are passed through as-is
				prevWasTag = true
				if _, err := out.Write(part); err != nil {
					return nil, err
				}
				continue
			}

			// content begins right after a tag, or at the very first part of the first read
			atContentBegin := prevWasTag || (round == 1 && i == 0)
			prevWasTag = false
			if atContentBegin {
				if part, err = es.trimAndWriteBom(part); err != nil {
					return nil, err
				}
			}
			if err = es.detectAndWriteRunes(part); err != nil {
				return nil, err
			}
		}
	}
}
// trimAndWriteBom writes a leading UTF-8 BOM of part directly to the output (so that it does not
// go through rune detection) and returns the part without the BOM. A part without BOM is returned as-is.
func (e *escapeStreamer) trimAndWriteBom(part []byte) ([]byte, error) {
	bom := globalVars().utf8Bom
	rest, hadBOM := bytes.CutPrefix(part, bom)
	if !hadBOM {
		return part, nil
	}
	if _, err := e.out.Write(bom); err != nil {
		return rest, err
	}
	return rest, nil
}
// longSentenceDetectionLimit is the number of runes inspected on each side of a position
// when deciding whether it is part of a non-ASCII sentence.
const longSentenceDetectionLimit = 20

// possibleLongSentence reports whether at least half of the counted runes around pos are
// non-ASCII (or ambiguous). Spaces and runes of other types are not counted.
func (e *escapeStreamer) possibleLongSentence(results []detectResult, pos int) bool {
	from := max(pos-longSentenceDetectionLimit, 0)
	to := min(pos+longSentenceDetectionLimit, len(results))

	var basic, nonASCII int
	for idx := from; idx < to; idx++ {
		switch results[idx].runeType {
		case runeTypeBasic:
			if results[idx].runeChar != ' ' {
				basic++
			}
		case runeTypeNonASCII, runeTypeAmbiguous:
			nonASCII++
		}
	}

	// many non-ASCII runes around, it seems to be a sentence,
	// don't handle the invisible/ambiguous chars in it, otherwise it will be too noisy
	total := basic + nonASCII
	return total != 0 && nonASCII*100/total >= 50
}
// analyzeDetectResults decides, for every invisible or ambiguous rune, whether it needs to be escaped.
// Such a rune is left alone when a direct neighbor is non-ASCII/ambiguous, or when it seems to
// be part of a long non-ASCII sentence.
func (e *escapeStreamer) analyzeDetectResults(results []detectResult) {
	// neighborIsNonASCII reports whether the rune at idx exists and is non-ASCII or ambiguous
	neighborIsNonASCII := func(idx int) bool {
		if idx < 0 || idx >= len(results) {
			return false
		}
		typ := results[idx].runeType
		return typ == runeTypeNonASCII || typ == runeTypeAmbiguous
	}

	for i := range results {
		cur := &results[i]
		if cur.runeType != runeTypeInvisible && cur.runeType != runeTypeAmbiguous {
			continue
		}
		if neighborIsNonASCII(i-1) || neighborIsNonASCII(i+1) {
			continue
		}
		// short inputs are always handled; longer ones only when not inside a non-ASCII sentence
		if len(results) < longSentenceDetectionLimit || !e.possibleLongSentence(results, i) {
			cur.needEscape = setting.UI.AmbiguousUnicodeDetection
		}
	}
}
// detectAndWriteRunes runs the three stages for one content part:
// classify each rune, decide which ones need escaping, then write the result.
func (e *escapeStreamer) detectAndWriteRunes(part []byte) error {
	detected := e.detectRunes(part)
	e.analyzeDetectResults(detected)
	return e.writeDetectResults(part, detected)
}
// readRunes reads more data if there is room in the buffer, then splits the buffered bytes into
// parts. parts[i] is a copy of the bytes of one part and partInTag[i] tells whether that part
// is inside a tag ("<" up to and including ">") or is content between tags.
// A part never ends in the middle of a UTF-8 sequence: trailing bytes that don't decode are
// left in the buffer for the next call. The stored read error (e.g. io.EOF) is only returned
// once the buffer has been drained.
func (e *htmlChunkReader) readRunes() (parts [][]byte, partInTag []bool, _ error) {
	// we have read everything, eof
	if e.readErr != nil && len(e.readBuf) == 0 {
		return nil, nil, e.readErr
	}

	// not eof, and there is space in the buffer (it is at most 3/4 full), try to read more data
	if e.readErr == nil && len(e.readBuf) <= cap(e.readBuf)*3/4 {
		n, err := e.in.Read(e.readBuf[len(e.readBuf):cap(e.readBuf)])
		e.readErr = err
		e.readBuf = e.readBuf[:len(e.readBuf)+n]
	}
	if len(e.readBuf) == 0 {
		return nil, nil, e.readErr
	}

	// try to extract tag parts and content parts
	pos := 0
	for pos < len(e.readBuf) {
		var curPartEnd int
		nextInTag := e.curInTag
		if e.curInTag {
			// if cur part is in tag, try to find the tag close char '>'
			idx := bytes.IndexByte(e.readBuf[pos:], '>')
			if idx == -1 {
				// if no tag close char, then the whole buffer is in tag
				curPartEnd = len(e.readBuf)
			} else {
				// tag part ends (the '>' belongs to the tag part), switch to content part
				curPartEnd = pos + idx + 1
				nextInTag = !nextInTag
			}
		} else {
			// if cur part is in content, try to find the tag open char '<'
			idx := bytes.IndexByte(e.readBuf[pos:], '<')
			if idx == -1 {
				// if no tag open char, then the whole buffer is in content
				curPartEnd = len(e.readBuf)
			} else {
				// content part ends (the '<' belongs to the next tag part), switch to tag part
				curPartEnd = pos + idx
				nextInTag = !nextInTag
			}
		}

		curPartLen := curPartEnd - pos
		if curPartLen == 0 {
			// if cur part is empty, only need to switch the part type
			if e.curInTag == nextInTag {
				panic("impossible, curPartLen is 0 but the part in tag status is not switched")
			}
			e.curInTag = nextInTag
			continue
		}

		// now, curPartLen can't be 0
		// the part is copied because readBuf is compacted and reused below
		curPart := make([]byte, curPartLen)
		copy(curPart, e.readBuf[pos:curPartEnd])

		// now we get the curPart bytes, but we can't directly use it, the last rune in it might have been cut
		// try to decode the last rune, if it's invalid, then we cut the last byte and try again until we get a valid rune or no byte left
		for i := curPartLen - 1; i >= 0; i-- {
			last, lastSize := utf8.DecodeRune(curPart[i:])
			if last == utf8.RuneError && lastSize == 1 {
				curPartLen--
			} else {
				// a valid rune starts at i: restore the bytes that were cut while scanning through it
				curPartLen += lastSize - 1
				break
			}
		}
		if curPartLen == 0 {
			// actually it's impossible that the part doesn't contain any valid rune,
			// the only case is that the cap(readBuf) is too small, or the original content indeed doesn't contain any valid rune
			// * try to leave the last 4 bytes (possible longest utf-8 encoding) to next round
			// * at least consume 1 byte to avoid infinite loop
			curPartLen = max(len(curPart)-utf8.UTFMax, 1)
		}

		// if curPartLen is not the same as curPart, it means we have cut some bytes,
		// need to wait for more data if not eof
		trailingCorrupted := curPartLen != len(curPart)

		// finally, we get the real part we need
		curPart = curPart[:curPartLen]
		parts = append(parts, curPart)
		partInTag = append(partInTag, e.curInTag)

		pos += curPartLen
		// NOTE(review): when bytes were cut from a content part that ends right before a '<',
		// the leftover bytes look like they get processed as part of the following tag,
		// because the in-tag state is switched here regardless — confirm whether this is intended.
		e.curInTag = nextInTag
		if trailingCorrupted && e.readErr == nil {
			// if the last part is corrupted, and we haven't reach eof, then we need to wait for more data to get the complete part
			break
		}
	}

	// move the unprocessed bytes to the beginning of the buffer
	copy(e.readBuf, e.readBuf[pos:])
	e.readBuf = e.readBuf[:len(e.readBuf)-pos]
	return parts, partInTag, nil
}
// writeDetectResults writes data to the output: consecutive runes that don't need escaping are
// written as one raw chunk, every rune that needs escaping is written as its HTML replacement.
func (e *escapeStreamer) writeDetectResults(data []byte, results []detectResult) error {
	// rawStart is the byte offset where the pending raw chunk begins, valid only while rawPending is set
	rawPending, rawStart := false, 0

	for i := range results {
		res := &results[i]
		if !res.needEscape {
			if !rawPending {
				rawPending, rawStart = true, res.position
			}
			continue
		}

		// flush the raw chunk that precedes this escaped rune
		if rawPending {
			if _, err := e.out.Write(data[rawStart:res.position]); err != nil {
				return err
			}
			rawPending = false
		}

		var err error
		switch res.runeType {
		case runeTypeBroken:
			err = e.writeBrokenRune(data[res.position : res.position+res.runeSize])
		case runeTypeAmbiguous:
			err = e.writeAmbiguousRune(res.runeChar, res.confusable)
		case runeTypeInvisible:
			err = e.writeInvisibleRune(res.runeChar)
		case runeTypeControlChar:
			err = e.writeControlRune(res.runeChar)
		default:
			panic("unreachable")
		}
		if err != nil {
			return err
		}
	}

	// flush the trailing raw chunk, up to the end of the last rune
	if rawPending {
		last := results[len(results)-1]
		if _, err := e.out.Write(data[rawStart : last.position+last.runeSize]); err != nil {
			return err
		}
	}
	return nil
}
// writeBrokenRune writes the placeholder element for a rune that failed to decode.
// The original bytes are intentionally not used.
func (e *escapeStreamer) writeBrokenRune(_ []byte) (err error) {
	// Showing the real broken bytes to users would be nice, but by the time this "escape stream"
	// module sees the content, it has already been processed by other modules.
	// So the invalid bytes can't survive until this step: in most (all) cases, the only thing
	// seen here is utf8.RuneError.
	if _, err = io.WriteString(e.out, `<span class="broken-code-point"></span>`); err != nil {
		return err
	}
	return nil
}
// writeEscapedCharHTML writes tag1 + attr + tag2 + content + tag3 to the output.
// attr and content are HTML-escaped, the three tag pieces are written verbatim.
// Writing stops at the first error.
func (e *escapeStreamer) writeEscapedCharHTML(tag1, attr, tag2, content, tag3 string) (err error) {
	pieces := [...]string{
		tag1,
		html.EscapeString(attr),
		tag2,
		html.EscapeString(content),
		tag3,
	}
	for _, piece := range pieces {
		if _, err = io.WriteString(e.out, piece); err != nil {
			return err
		}
	}
	return nil
}
// runeToHex formats a rune as "[U+XXXX]": upper-case hex, zero-padded to at least 4 digits.
func runeToHex(r rune) string {
	return "[U+" + fmt.Sprintf("%04X", r) + "]"
}
// writeAmbiguousRune records that an ambiguous rune was escaped and writes r wrapped in an
// "ambiguous-code-point" element whose tooltip names both r and the rune c it can be confused with.
func (e *escapeStreamer) writeAmbiguousRune(r, c rune) (err error) {
	e.escaped.Escaped, e.escaped.HasAmbiguous = true, true

	tooltip := e.locale.TrString("repo.ambiguous_character", string(r)+" "+runeToHex(r), string(c)+" "+runeToHex(c))
	return e.writeEscapedCharHTML(
		`<span class="ambiguous-code-point" data-tooltip-content="`,
		tooltip,
		`"><span class="char">`,
		string(r),
		`</span></span>`,
	)
}
// writeInvisibleRune records that an invisible rune was escaped and writes r wrapped in an
// "escaped-code-point" element that carries the code point as its data-escaped attribute.
func (e *escapeStreamer) writeInvisibleRune(r rune) error {
	e.escaped.Escaped, e.escaped.HasInvisible = true, true

	codePoint := runeToHex(r)
	return e.writeEscapedCharHTML(
		`<span class="escaped-code-point" data-escaped="`,
		codePoint,
		`"><span class="char">`,
		string(r),
		`</span></span>`,
	)
}
// writeControlRune writes a control rune wrapped in a "broken-code-point" element.
// The data-escaped attribute holds a visible stand-in: the matching Unicode "control picture"
// (U+2400..U+241F for 0x00..0x1f, U+2421 for 0x7f), or the code point for anything else.
func (e *escapeStreamer) writeControlRune(r rune) error {
	var display string
	switch {
	case r >= 0 && r <= 0x1f:
		display = string(0x2400 + r)
	case r == 0x7f:
		display = "\u2421"
	default:
		display = runeToHex(r)
	}
	return e.writeEscapedCharHTML(
		`<span class="broken-code-point" data-escaped="`,
		display,
		`"><span class="char">`,
		string(r),
		`</span></span>`,
	)
}
// detectResult describes one decoded rune of a content part.
type detectResult struct {
	runeChar   rune // the decoded rune (utf8.RuneError if decoding failed)
	runeType   int  // one of the runeType* constants
	runeSize   int  // number of bytes the rune occupies in the part
	position   int  // byte offset of the rune in the part
	confusable rune // for ambiguous runes: the rune it can be confused with
	needEscape bool // whether the rune must be written in its escaped form
}
// Classification of a rune, as assigned by detectRunes.
const (
	runeTypeBasic       int = iota // the zero value: anything not covered by the types below
	runeTypeBroken                 // decoded as utf8.RuneError
	runeTypeNonASCII               // >= 0x80 and not invisible or ambiguous
	runeTypeAmbiguous              // found in one of the ambiguous tables
	runeTypeInvisible              // found in the invisible range table
	runeTypeControlChar            // < 0x20 or 0x7f, except space-like runes and explicitly allowed ones
)
// detectRunes decodes data rune by rune and classifies each one (see the runeType* constants).
// Broken and control runes are marked for escaping right away; for invisible and ambiguous runes
// the decision is left to a later step.
func (e *escapeStreamer) detectRunes(data []byte) []detectResult {
	results := make([]detectResult, utf8.RuneCount(data))
	invisible := globalVars().invisibleRangeTable

	var confusable rune
	for idx, pos := 0, 0; pos < len(data); idx++ {
		r, size := utf8.DecodeRune(data[pos:])
		res := &results[idx]
		res.runeChar, res.runeSize, res.position = r, size, pos
		pos += size

		// the order of the cases matters: the first matching one wins
		switch {
		case r == utf8.RuneError:
			res.runeType = runeTypeBroken
			res.needEscape = true
		case r == ' ' || r == '\t' || r == '\n' || e.allowed[r]:
			// explicitly accepted runes are never escaped
			if r >= 0x80 {
				res.runeType = runeTypeNonASCII
			} else {
				res.runeType = runeTypeBasic
			}
		case r < 0x20 || r == 0x7f:
			res.runeType = runeTypeControlChar
			res.needEscape = true
		case unicode.Is(invisible, r):
			// needEscape is not decided here, it will be detected separately
			res.runeType = runeTypeInvisible
		case isAmbiguous(r, &confusable, e.ambiguousTables...):
			// needEscape is not decided here, it will be detected separately
			res.runeType = runeTypeAmbiguous
			res.confusable = confusable
		case r >= 0x80:
			res.runeType = runeTypeNonASCII
		default:
			// everything else stays a basic rune (the zero value)
		}
	}
	return results
}
+212
View File
@@ -0,0 +1,212 @@
// Copyright 2021 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package charset
import (
"strings"
"testing"
"gitea.dev/modules/setting"
"gitea.dev/modules/test"
"gitea.dev/modules/translation"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// escapeControlTest is one table-driven case for the escaping tests.
type escapeControlTest struct {
	name   string       // name of the case
	text   string       // the input text
	status EscapeStatus // presumably the expected findings for text — confirm against the test body
	result string       // presumably the expected escaped output for text — confirm against the test body
}
// escapeControlTests is the case table run by TestEscapeControlReader.
// Invisible characters are always spelled as escapes (for example "\u00a0" or
// "\u202e") so that they stay visible when this file is read or edited.
var escapeControlTests = []escapeControlTest{
	{
		name: "<empty>",
	},
	{
		name:   "single line western",
		text:   "single line western",
		result: "single line western",
		status: EscapeStatus{},
	},
	{
		name:   "multi line western",
		text:   "single line western\nmulti line western\n",
		result: "single line western\nmulti line western\n",
		status: EscapeStatus{},
	},
	{
		name:   "multi line western non-breaking space",
		text:   "single line\u00a0western\nmulti line\u00a0western\n",
		result: `single line<span class="escaped-code-point" data-escaped="[U+00A0]"><span class="char">` + "\u00a0" + `</span></span>western` + "\n" + `multi line<span class="escaped-code-point" data-escaped="[U+00A0]"><span class="char">` + "\u00a0" + `</span></span>western` + "\n",
		status: EscapeStatus{Escaped: true, HasInvisible: true},
	},
	{
		name:   "mixed scripts: western + japanese",
		text:   "日属秘ぞしちゅ。Then some western.",
		result: "日属秘ぞしちゅ。Then some western.",
		status: EscapeStatus{},
	},
	{
		name:   "japanese",
		text:   "日属秘ぞしちゅ。",
		result: "日属秘ぞしちゅ。",
		status: EscapeStatus{},
	},
	{
		name:   "hebrew", // old test was wrong, such text shouldn't be escaped
		text:   "עד תקופת יוון העתיקה היה העיסוק במתמטיקה תכליתי בלבד: היא שימשה כאוסף של נוסחאות לחישוב קרקע, אוכלוסין וכו'. פריצת הדרך של היוונים, פרט לתרומותיהם הגדולות לידע המתמטי, הייתה בלימוד המתמטיקה כשלעצמה, מתוקף ערכה הרוחני. יחסם של חלק מהיוונים הקדמונים למתמטיקה היה דתי - למשל, הכת שאסף סביבו פיתגורס האמינה כי המתמטיקה היא הבסיס לכל הדברים. היוונים נחשבים ליוצרי מושג ההוכחה המתמטית, וכן לראשונים שעסקו במתמטיקה לשם עצמה, כלומר כתחום מחקרי עיוני ומופשט ולא רק כעזר שימושי. עם זאת, לצדה",
		result: "עד תקופת יוון העתיקה היה העיסוק במתמטיקה תכליתי בלבד: היא שימשה כאוסף של נוסחאות לחישוב קרקע, אוכלוסין וכו'. פריצת הדרך של היוונים, פרט לתרומותיהם הגדולות לידע המתמטי, הייתה בלימוד המתמטיקה כשלעצמה, מתוקף ערכה הרוחני. יחסם של חלק מהיוונים הקדמונים למתמטיקה היה דתי - למשל, הכת שאסף סביבו פיתגורס האמינה כי המתמטיקה היא הבסיס לכל הדברים. היוונים נחשבים ליוצרי מושג ההוכחה המתמטית, וכן לראשונים שעסקו במתמטיקה לשם עצמה, כלומר כתחום מחקרי עיוני ומופשט ולא רק כעזר שימושי. עם זאת, לצדה",
		status: EscapeStatus{},
	},
	{
		name: "more hebrew", // old test was wrong, such text shouldn't be escaped
		text: `בתקופה מאוחרת יותר, השתמשו היוונים בשיטת סימון מתקדמת יותר, שבה הוצגו המספרים לפי 22 אותיות האלפבית היווני. לסימון המספרים בין 1 ל-9 נקבעו תשע האותיות הראשונות, בתוספת גרש ( ' ) בצד ימין של האות, למעלה; תשע האותיות הבאות ייצגו את העשרות מ-10 עד 90, והבאות את המאות. לסימון הספרות בין 1000 ל-900,000, השתמשו היוונים באותן אותיות, אך הוסיפו לאותיות את הגרש דווקא מצד שמאל של האותיות, למטה. ממיליון ומעלה, כנראה השתמשו היוונים בשני תגים במקום אחד.
המתמטיקאי הבולט הראשון ביוון העתיקה, ויש האומרים בתולדות האנושות, הוא תאלס (624 לפנה"ס - 546 לפנה"ס בקירוב).[1] לא יהיה זה משולל יסוד להניח שהוא האדם הראשון שהוכיח משפט מתמטי, ולא רק גילה אותו. תאלס הוכיח שישרים מקבילים חותכים מצד אחד של שוקי זווית קטעים בעלי יחסים שווים (משפט תאלס הראשון), שהזווית המונחת על קוטר במעגל היא זווית ישרה (משפט תאלס השני), שהקוטר מחלק את המעגל לשני חלקים שווים, ושזוויות הבסיס במשולש שווה-שוקיים שוות זו לזו. מיוחסות לו גם שיטות למדידת גובהן של הפירמידות בעזרת מדידת צילן ולקביעת מיקומה של ספינה הנראית מן החוף.
בשנים 582 לפנה"ס עד 496 לפנה"ס, בקירוב, חי מתמטיקאי חשוב במיוחד - פיתגורס. המקורות הראשוניים עליו מועטים, וההיסטוריונים מתקשים להפריד את העובדות משכבת המסתורין והאגדות שנקשרו בו. ידוע שסביבו התקבצה האסכולה הפיתגוראית מעין כת פסבדו-מתמטית שהאמינה ש"הכל מספר", או ליתר דיוק הכל ניתן לכימות, וייחסה למספרים משמעויות מיסטיות. ככל הנראה הפיתגוראים ידעו לבנות את הגופים האפלטוניים, הכירו את הממוצע האריתמטי, הממוצע הגאומטרי והממוצע ההרמוני והגיעו להישגים חשובים נוספים. ניתן לומר שהפיתגוראים גילו את היותו של השורש הריבועי של 2, שהוא גם האלכסון בריבוע שאורך צלעותיו 1, אי רציונלי, אך תגליתם הייתה למעשה רק שהקטעים "חסרי מידה משותפת", ומושג המספר האי רציונלי מאוחר יותר.[2] אזכור ראשון לקיומם של קטעים חסרי מידה משותפת מופיע בדיאלוג "תאיטיטוס" של אפלטון, אך רעיון זה היה מוכר עוד קודם לכן, במאה החמישית לפנה"ס להיפאסוס, בן האסכולה הפיתגוראית, ואולי לפיתגורס עצמו.[3]`,
		result: `בתקופה מאוחרת יותר, השתמשו היוונים בשיטת סימון מתקדמת יותר, שבה הוצגו המספרים לפי 22 אותיות האלפבית היווני. לסימון המספרים בין 1 ל-9 נקבעו תשע האותיות הראשונות, בתוספת גרש ( ' ) בצד ימין של האות, למעלה; תשע האותיות הבאות ייצגו את העשרות מ-10 עד 90, והבאות את המאות. לסימון הספרות בין 1000 ל-900,000, השתמשו היוונים באותן אותיות, אך הוסיפו לאותיות את הגרש דווקא מצד שמאל של האותיות, למטה. ממיליון ומעלה, כנראה השתמשו היוונים בשני תגים במקום אחד.
המתמטיקאי הבולט הראשון ביוון העתיקה, ויש האומרים בתולדות האנושות, הוא תאלס (624 לפנה"ס - 546 לפנה"ס בקירוב).[1] לא יהיה זה משולל יסוד להניח שהוא האדם הראשון שהוכיח משפט מתמטי, ולא רק גילה אותו. תאלס הוכיח שישרים מקבילים חותכים מצד אחד של שוקי זווית קטעים בעלי יחסים שווים (משפט תאלס הראשון), שהזווית המונחת על קוטר במעגל היא זווית ישרה (משפט תאלס השני), שהקוטר מחלק את המעגל לשני חלקים שווים, ושזוויות הבסיס במשולש שווה-שוקיים שוות זו לזו. מיוחסות לו גם שיטות למדידת גובהן של הפירמידות בעזרת מדידת צילן ולקביעת מיקומה של ספינה הנראית מן החוף.
בשנים 582 לפנה"ס עד 496 לפנה"ס, בקירוב, חי מתמטיקאי חשוב במיוחד - פיתגורס. המקורות הראשוניים עליו מועטים, וההיסטוריונים מתקשים להפריד את העובדות משכבת המסתורין והאגדות שנקשרו בו. ידוע שסביבו התקבצה האסכולה הפיתגוראית מעין כת פסבדו-מתמטית שהאמינה ש"הכל מספר", או ליתר דיוק הכל ניתן לכימות, וייחסה למספרים משמעויות מיסטיות. ככל הנראה הפיתגוראים ידעו לבנות את הגופים האפלטוניים, הכירו את הממוצע האריתמטי, הממוצע הגאומטרי והממוצע ההרמוני והגיעו להישגים חשובים נוספים. ניתן לומר שהפיתגוראים גילו את היותו של השורש הריבועי של 2, שהוא גם האלכסון בריבוע שאורך צלעותיו 1, אי רציונלי, אך תגליתם הייתה למעשה רק שהקטעים "חסרי מידה משותפת", ומושג המספר האי רציונלי מאוחר יותר.[2] אזכור ראשון לקיומם של קטעים חסרי מידה משותפת מופיע בדיאלוג "תאיטיטוס" של אפלטון, אך רעיון זה היה מוכר עוד קודם לכן, במאה החמישית לפנה"ס להיפאסוס, בן האסכולה הפיתגוראית, ואולי לפיתגורס עצמו.[3]`,
		status: EscapeStatus{},
	},
	{
		name: "Mixed RTL+LTR",
		text: `Many computer programs fail to display bidirectional text correctly.
For example, the Hebrew name Sarah (שרה) is spelled: sin (ש) (which appears rightmost),
then resh (ר), and finally heh (ה) (which should appear leftmost).`,
		result: `Many computer programs fail to display bidirectional text correctly.
For example, the Hebrew name Sarah (שרה) is spelled: sin (ש) (which appears rightmost),
then resh (ר), and finally heh (ה) (which should appear leftmost).`,
		status: EscapeStatus{},
	},
	{
		name: "Mixed RTL+LTR+BIDI",
		text: `Many computer programs fail to display bidirectional text correctly.
For example, the Hebrew name Sarah ` + "\u2067" + `שרה` + "\u2066\n" +
			`sin (ש) (which appears rightmost), then resh (ר), and finally heh (ה) (which should appear leftmost).`,
		result: `Many computer programs fail to display bidirectional text correctly.
For example, the Hebrew name Sarah ` + "\u2067" + `שרה` + "\u2066\n" +
			`sin (ש) (which appears rightmost), then resh (ר), and finally heh (ה) (which should appear leftmost).`,
		status: EscapeStatus{},
	},
	{
		name:   "Accented characters",
		text:   string([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}),
		result: string([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}),
		status: EscapeStatus{},
	},
	{
		name:   "Program",
		text:   "string([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})",
		result: "string([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})",
		status: EscapeStatus{},
	},
	{
		name:   "CVE testcase",
		text:   "if access_level != \"user\u202E \u2066// Check if admin\u2069 \u2066\" {",
		result: `if access_level != "user<span class="escaped-code-point" data-escaped="[U+202E]"><span class="char">` + "\u202e" + `</span></span> <span class="escaped-code-point" data-escaped="[U+2066]"><span class="char">` + "\u2066" + `</span></span>// Check if admin<span class="escaped-code-point" data-escaped="[U+2069]"><span class="char">` + "\u2069" + `</span></span> <span class="escaped-code-point" data-escaped="[U+2066]"><span class="char">` + "\u2066" + `</span></span>" {`,
		status: EscapeStatus{Escaped: true, HasInvisible: true},
	},
	{
		name: "Mixed testcase with fail",
		text: `Many computer programs fail to display bidirectional text correctly.
For example, the Hebrew name Sarah ` + "\u2067" + `שרה` + "\u2066\n" +
			`sin (ש) (which appears rightmost), then resh (ר), and finally heh (ה) (which should appear leftmost).` +
			"\nif access_level != \"user\u202E \u2066// Check if admin\u2069 \u2066\" {\n",
		result: `Many computer programs fail to display bidirectional text correctly.
For example, the Hebrew name Sarah ` + "\u2067" + `שרה` + "\u2066\n" +
			`sin (ש) (which appears rightmost), then resh (ר), and finally heh (ה) (which should appear leftmost).` +
			"\n" + `if access_level != "user<span class="escaped-code-point" data-escaped="[U+202E]"><span class="char">` + "\u202e" + `</span></span> <span class="escaped-code-point" data-escaped="[U+2066]"><span class="char">` + "\u2066" + `</span></span>// Check if admin<span class="escaped-code-point" data-escaped="[U+2069]"><span class="char">` + "\u2069" + `</span></span> <span class="escaped-code-point" data-escaped="[U+2066]"><span class="char">` + "\u2066" + `</span></span>" {` + "\n",
		status: EscapeStatus{Escaped: true, HasInvisible: true},
	},
	{
		// UTF-8/16/32 all use the same codepoint for BOM
		// Gitea could read UTF-16/32 content and convert into UTF-8 internally then render it, so we only process UTF-8 internally
		name:   "UTF BOM",
		text:   "\xef\xbb\xbftest",
		result: "\xef\xbb\xbftest",
		status: EscapeStatus{},
	},
	{
		name:   "ambiguous",
		text:   "O𝐾",
		result: `O<span class="ambiguous-code-point" data-tooltip-content="repo.ambiguous_character:𝐾 [U+1D43E],K [U+004B]"><span class="char">𝐾</span></span>`,
		status: EscapeStatus{Escaped: true, HasAmbiguous: true},
	},
}
// TestEscapeControlReader feeds every entry of escapeControlTests through
// EscapeControlReader and compares the returned status and the written output
// with the expectations stored in the table.
func TestEscapeControlReader(t *testing.T) {
	for _, tc := range escapeControlTests {
		t.Run(tc.name, func(t *testing.T) {
			var sb strings.Builder
			gotStatus, err := EscapeControlReader(strings.NewReader(tc.text), &sb, &translation.MockLocale{})
			assert.NoError(t, err)
			assert.Equal(t, tc.status, *gotStatus)
			assert.Equal(t, tc.result, sb.String())
		})
	}
}
// TestSettingAmbiguousUnicodeDetection checks that EscapeControlHTML escapes a
// non-breaking space while setting.UI.AmbiguousUnicodeDetection is true and
// returns the input unchanged once the setting is false.
func TestSettingAmbiguousUnicodeDetection(t *testing.T) {
	// The non-breaking space (U+00A0) is written as an escape on purpose: as a
	// literal character it is indistinguishable from a plain space in source.
	const input = "a\u00a0test"

	// NOTE(review): the deferred call presumably restores the previous value of the setting - confirm against test.MockVariableValue.
	defer test.MockVariableValue(&setting.UI.AmbiguousUnicodeDetection, true)()
	_, out := EscapeControlHTML(input, &translation.MockLocale{})
	assert.EqualValues(t, `a<span class="escaped-code-point" data-escaped="[U+00A0]"><span class="char">`+"\u00a0"+`</span></span>test`, out)

	setting.UI.AmbiguousUnicodeDetection = false
	_, out = EscapeControlHTML(input, &translation.MockLocale{})
	assert.EqualValues(t, input, out)
}
// TestHTMLChunkReader checks how htmlChunkReader.readRunes splits its input
// into text and tag parts for several read-buffer capacities, including
// inputs made of runes that are 1 to 4 bytes long.
func TestHTMLChunkReader(t *testing.T) {
	type textPart struct {
		text  string
		isTag bool
	}

	// testReadChunks drains a reader whose buffer capacity is chunkSize and
	// compares every part it returned with expected.
	testReadChunks := func(t *testing.T, chunkSize int, input string, expected []textPart) {
		reader := &htmlChunkReader{in: strings.NewReader(input), readBuf: make([]byte, 0, chunkSize)}
		var collected []textPart
		for {
			chunks, tagFlags, err := reader.readRunes()
			if err != nil {
				break
			}
			for idx, chunk := range chunks {
				collected = append(collected, textPart{string(chunk), tagFlags[idx]})
			}
		}
		assert.Equal(t, expected, collected, "chunk size: %d, input: %s", chunkSize, input)
	}

	testReadChunks(t, 10, "abc<def>ghi", []textPart{
		{text: "abc", isTag: false},
		{text: "<def>", isTag: true},
		{text: "gh", isTag: false},
		// -- chunk
		{text: "i", isTag: false},
	})
	testReadChunks(t, 10, "<abc><def>ghi", []textPart{
		{text: "<abc>", isTag: true},
		{text: "<def>", isTag: true},
		// -- chunk
		{text: "ghi", isTag: false},
	})

	// one sample rune for every possible UTF-8 encoded length (1 to 4 bytes)
	samples := []string{"A", "é", "啊", "🌞"}
	for i, sample := range samples {
		require.Len(t, sample, i+1)
	}
	joined := strings.Join(samples, "")
	input := "<" + joined + ">" + joined

	multiByteCases := []struct {
		chunkSize int
		expected  []textPart
	}{
		{4, []textPart{{"<Aé", true}, {"啊", true}, {"🌞", true}, {">", true}, {"Aé", false}, {"啊", false}, {"🌞", false}}},
		{5, []textPart{{"<Aé", true}, {"啊", true}, {"🌞>", true}, {"Aé", false}, {"啊", false}, {"🌞", false}}},
		{6, []textPart{{"<Aé", true}, {"啊", true}, {"🌞>", true}, {"A", false}, {"é啊", false}, {"🌞", false}}},
		{7, []textPart{{"<Aé啊", true}, {"🌞>", true}, {"A", false}, {"é啊", false}, {"🌞", false}}},
	}
	for _, mc := range multiByteCases {
		testReadChunks(t, mc.chunkSize, input, mc.expected)
	}
}
File diff suppressed because one or more lines are too long
+201
View File
@@ -0,0 +1,201 @@
// Copyright 2022 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package main
import (
"bytes"
"fmt"
"go/format"
"log"
"os"
"sort"
"text/template"
"unicode"
"gitea.dev/modules/json"
"golang.org/x/text/unicode/rangetable"
)
// AmbiguousTable holds the ambiguous character data of one locale as it is
// passed to the code generation template.
//
// The data comes from ambiguous.json, which provides a one to one mapping of
// ambiguous characters to other characters.
// See https://github.com/hediet/vscode-unicode-data/blob/main/out/ambiguous.json
type AmbiguousTable struct {
Confusable []rune // the ambiguous runes, filled in ascending order by generateAmbiguous
With []rune // With[i] is the rune that Confusable[i] can be confused with
Locale string // key of the entry in ambiguous.json
RangeTable *unicode.RangeTable // range table built from Confusable
}
// RunePair is one (confusable, with) entry read from ambiguous.json; it is
// used by generateAmbiguous to sort the entries by their Confusable rune.
type RunePair struct {
Confusable rune // the ambiguous rune
With rune // the rune it can be confused with
}
// InvisibleRunes lists the runes that vscode has assigned to be invisible.
// generateInvisible drops <space>, <tab> and <newline> from this list before
// it builds the range table that is written to the generated file.
// See https://github.com/hediet/vscode-unicode-data
var InvisibleRunes = []rune{
9, 10, 11, 12, 13, 32, 127, 160, 173, 847, 1564, 4447, 4448, 6068, 6069, 6155, 6156, 6157, 6158, 7355, 7356, 8192, 8193, 8194, 8195, 8196, 8197, 8198, 8199, 8200, 8201, 8202, 8203, 8204, 8205, 8206, 8207, 8234, 8235, 8236, 8237, 8238, 8239, 8287, 8288, 8289, 8290, 8291, 8292, 8293, 8294, 8295, 8296, 8297, 8298, 8299, 8300, 8301, 8302, 8303, 10240, 12288, 12644, 65024, 65025, 65026, 65027, 65028, 65029, 65030, 65031, 65032, 65033, 65034, 65035, 65036, 65037, 65038, 65039, 65279, 65440, 65520, 65521, 65522, 65523, 65524, 65525, 65526, 65527, 65528, 65532, 78844, 119155, 119156, 119157, 119158, 119159, 119160, 119161, 119162, 917504, 917505, 917506, 917507, 917508, 917509, 917510, 917511, 917512, 917513, 917514, 917515, 917516, 917517, 917518, 917519, 917520, 917521, 917522, 917523, 917524, 917525, 917526, 917527, 917528, 917529, 917530, 917531, 917532, 917533, 917534, 917535, 917536, 917537, 917538, 917539, 917540, 917541, 917542, 917543, 917544, 917545, 917546, 917547, 917548, 917549, 917550, 917551, 917552, 917553, 917554, 917555, 917556, 917557, 917558, 917559, 917560, 917561, 917562, 917563, 917564, 917565, 917566, 917567, 917568, 917569, 917570, 917571, 917572, 917573, 917574, 917575, 917576, 917577, 917578, 917579, 917580, 917581, 917582, 917583, 917584, 917585, 917586, 917587, 917588, 917589, 917590, 917591, 917592, 917593, 917594, 917595, 917596, 917597, 917598, 917599, 917600, 917601, 917602, 917603, 917604, 917605, 917606, 917607, 917608, 917609, 917610, 917611, 917612, 917613, 917614, 917615, 917616, 917617, 917618, 917619, 917620, 917621, 917622, 917623, 917624, 917625, 917626, 917627, 917628, 917629, 917630, 917631, 917760, 917761, 917762, 917763, 917764, 917765, 917766, 917767, 917768, 917769, 917770, 917771, 917772, 917773, 917774, 917775, 917776, 917777, 917778, 917779, 917780, 917781, 917782, 917783, 917784, 917785, 917786, 917787, 917788, 917789, 917790, 917791, 917792, 917793, 917794, 917795, 917796, 917797, 917798, 917799, 917800, 917801, 917802, 
917803, 917804, 917805, 917806, 917807, 917808, 917809, 917810, 917811, 917812, 917813, 917814, 917815, 917816, 917817, 917818, 917819, 917820, 917821, 917822, 917823, 917824, 917825, 917826, 917827, 917828, 917829, 917830, 917831, 917832, 917833, 917834, 917835, 917836, 917837, 917838, 917839, 917840, 917841, 917842, 917843, 917844, 917845, 917846, 917847, 917848, 917849, 917850, 917851, 917852, 917853, 917854, 917855, 917856, 917857, 917858, 917859, 917860, 917861, 917862, 917863, 917864, 917865, 917866, 917867, 917868, 917869, 917870, 917871, 917872, 917873, 917874, 917875, 917876, 917877, 917878, 917879, 917880, 917881, 917882, 917883, 917884, 917885, 917886, 917887, 917888, 917889, 917890, 917891, 917892, 917893, 917894, 917895, 917896, 917897, 917898, 917899, 917900, 917901, 917902, 917903, 917904, 917905, 917906, 917907, 917908, 917909, 917910, 917911, 917912, 917913, 917914, 917915, 917916, 917917, 917918, 917919, 917920, 917921, 917922, 917923, 917924, 917925, 917926, 917927, 917928, 917929, 917930, 917931, 917932, 917933, 917934, 917935, 917936, 917937, 917938, 917939, 917940, 917941, 917942, 917943, 917944, 917945, 917946, 917947, 917948, 917949, 917950, 917951, 917952, 917953, 917954, 917955, 917956, 917957, 917958, 917959, 917960, 917961, 917962, 917963, 917964, 917965, 917966, 917967, 917968, 917969, 917970, 917971, 917972, 917973, 917974, 917975, 917976, 917977, 917978, 917979, 917980, 917981, 917982, 917983, 917984, 917985, 917986, 917987, 917988, 917989, 917990, 917991, 917992, 917993, 917994, 917995, 917996, 917997, 917998, 917999,
}
// generateAmbiguous reads ambiguous.json from the working directory, builds
// one AmbiguousTable per locale and renders them into ../ambiguous_gen.go.
//
// The file content is a JSON string that itself contains a JSON object mapping
// a locale to a flat list of code points laid out as pairs:
// [confusable0, with0, confusable1, with1, ...]. Every failure, including a
// list with an odd number of values, terminates the program via log.Fatalf.
func generateAmbiguous() {
	bs, err := os.ReadFile("ambiguous.json")
	if err != nil {
		log.Fatalf("Unable to read, err: %v", err)
	}

	// the document is double encoded: first unwrap the outer JSON string
	var unwrapped string
	if err := json.Unmarshal(bs, &unwrapped); err != nil {
		log.Fatalf("Unable to unwrap content in, err: %v", err)
	}

	fromJSON := map[string][]uint32{}
	if err := json.Unmarshal([]byte(unwrapped), &fromJSON); err != nil {
		log.Fatalf("Unable to unmarshal content in, err: %v", err)
	}

	tables := make([]*AmbiguousTable, 0, len(fromJSON))
	for locale, chars := range fromJSON {
		// An odd number of values cannot be split into pairs; fail with a
		// clear message instead of panicking with an index out of range below.
		if len(chars)%2 != 0 {
			log.Fatalf("Invalid data for locale %q: expected pairs of code points but got %d values", locale, len(chars))
		}

		pairs := make([]RunePair, 0, len(chars)/2)
		for i := 0; i < len(chars); i += 2 {
			pairs = append(pairs, RunePair{Confusable: rune(chars[i]), With: rune(chars[i+1])})
		}

		// keep Confusable sorted so the generated table can be searched by rune value
		sort.Slice(pairs, func(i, j int) bool {
			return pairs[i].Confusable < pairs[j].Confusable
		})

		table := &AmbiguousTable{
			Locale:     locale,
			Confusable: make([]rune, 0, len(pairs)),
			With:       make([]rune, 0, len(pairs)),
		}
		for _, pair := range pairs {
			table.Confusable = append(table.Confusable, pair.Confusable)
			table.With = append(table.With, pair.With)
		}
		table.RangeTable = rangetable.New(table.Confusable...)
		tables = append(tables, table)
	}

	// map iteration order is random, sort by locale for a stable generated file
	sort.Slice(tables, func(i, j int) bool {
		return tables[i].Locale < tables[j].Locale
	})

	data := map[string]any{"Tables": tables}
	if err := runTemplate(templateAmbiguous, "../ambiguous_gen.go", &data); err != nil {
		log.Fatalf("Unable to run template: %v", err)
	}
}
// generateInvisible builds a range table from InvisibleRunes, leaving out
// <space>, <tab> and <newline>, and renders it into ../invisible_gen.go.
// A template failure terminates the program via log.Fatalf.
func generateInvisible() {
	kept := make([]rune, 0, len(InvisibleRunes))
	for _, r := range InvisibleRunes {
		switch r {
		case ' ', '\t', '\n':
			// <space><tab><newline> are filtered out of the table
		default:
			kept = append(kept, r)
		}
	}

	if err := runTemplate(generatorInvisible, "../invisible_gen.go", rangetable.New(kept...)); err != nil {
		log.Fatalf("Unable to run template: %v", err)
	}
}
// runTemplate executes t with data, formats the output as Go source and
// stores it in filename.
//
// The file is left untouched when it already contains exactly the formatted
// output. An error is returned when the template cannot be executed, when its
// output is not valid Go source (the unformatted output is logged in that
// case), when the existing file cannot be read for a reason other than "does
// not exist", or when the new content cannot be written.
func runTemplate(t *template.Template, filename string, data any) error {
	var buf bytes.Buffer
	if err := t.Execute(&buf, data); err != nil {
		return fmt.Errorf("unable to execute template: %w", err)
	}

	bs, err := format.Source(buf.Bytes())
	if err != nil {
		log.Printf("Bad source:\n%s", buf.String())
		return fmt.Errorf("unable to format source: %w", err)
	}

	old, err := os.ReadFile(filename)
	switch {
	case err == nil:
		if bytes.Equal(bs, old) {
			// files are the same, don't rewrite it
			return nil
		}
	case !os.IsNotExist(err):
		return fmt.Errorf("failed to read old file %s because %w", filename, err)
	}

	// os.WriteFile creates or truncates the file just like os.Create did
	// (mode 0o666 before umask) and, unlike a deferred Close, also reports an
	// error that only surfaces when the file is closed.
	if err := os.WriteFile(filename, bs, 0o666); err != nil {
		return fmt.Errorf("unable to write generated source to %s: %w", filename, err)
	}
	return nil
}
// main regenerates the ambiguous character tables first and the invisible
// range table second; each generator terminates the program via log.Fatalf
// when it fails.
func main() {
generateAmbiguous()
generateInvisible()
}
// templateAmbiguous renders the tables passed as .Tables by generateAmbiguous
// into Go source: the AmbiguousTable type of package charset and a
// newAmbiguousTableMap function that returns the tables keyed by locale.
var templateAmbiguous = template.Must(template.New("ambiguousTemplate").Parse(`// This file is generated by modules/charset/generate/generate.go DO NOT EDIT
// Copyright 2026 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package charset
import "unicode"
// This file is generated from https://github.com/hediet/vscode-unicode-data/blob/main/out/ambiguous.json
// AmbiguousTable matches a confusable rune with its partner for the Locale
type AmbiguousTable struct {
Confusable []rune
With []rune
Locale string
RangeTable *unicode.RangeTable
}
func newAmbiguousTableMap() map[string]*AmbiguousTable {
return map[string]*AmbiguousTable {
{{- range .Tables}}
{{printf "%q" .Locale}}: {
Confusable: []rune{ {{range .Confusable}}{{.}},{{end}} },
With: []rune{ {{range .With}}{{.}},{{end}} },
Locale: {{printf "%q" .Locale}},
RangeTable: &unicode.RangeTable{
R16: []unicode.Range16{
{{range .RangeTable.R16 }} {Lo:{{.Lo}}, Hi:{{.Hi}}, Stride: {{.Stride}}},
{{end}} },
R32: []unicode.Range32{
{{range .RangeTable.R32}} {Lo:{{.Lo}}, Hi:{{.Hi}}, Stride: {{.Stride}}},
{{end}} },
LatinOffset: {{.RangeTable.LatinOffset}},
},
},
{{end}}
}
}
`))
// generatorInvisible renders the range table passed by generateInvisible into
// Go source: a newInvisibleRangeTable function of package charset that
// returns the table's R16 and R32 ranges and its LatinOffset.
var generatorInvisible = template.Must(template.New("invisibleTemplate").Parse(`// This file is generated by modules/charset/generate/generate.go DO NOT EDIT
// Copyright 2026 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package charset
import "unicode"
func newInvisibleRangeTable() *unicode.RangeTable {
return &unicode.RangeTable{
R16: []unicode.Range16{
{{range .R16 }} {Lo:{{.Lo}}, Hi:{{.Hi}}, Stride: {{.Stride}}},
{{end}}},
R32: []unicode.Range32{
{{range .R32}} {Lo:{{.Lo}}, Hi:{{.Hi}}, Stride: {{.Stride}}},
{{end}}},
LatinOffset: {{.LatinOffset}},
}
}
`))
+38
View File
@@ -0,0 +1,38 @@
// This file is generated by modules/charset/generate/generate.go DO NOT EDIT
// Copyright 2026 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package charset
import "unicode"
// newInvisibleRangeTable returns a newly allocated range table holding the
// literal ranges below. This file is generated (see the header): to change
// the ranges, change the generator input and regenerate instead of editing
// the values by hand.
func newInvisibleRangeTable() *unicode.RangeTable {
return &unicode.RangeTable{
R16: []unicode.Range16{
{Lo: 11, Hi: 13, Stride: 1},
{Lo: 127, Hi: 160, Stride: 33},
{Lo: 173, Hi: 847, Stride: 674},
{Lo: 1564, Hi: 4447, Stride: 2883},
{Lo: 4448, Hi: 6068, Stride: 1620},
{Lo: 6069, Hi: 6155, Stride: 86},
{Lo: 6156, Hi: 6158, Stride: 1},
{Lo: 7355, Hi: 7356, Stride: 1},
{Lo: 8192, Hi: 8207, Stride: 1},
{Lo: 8234, Hi: 8239, Stride: 1},
{Lo: 8287, Hi: 8303, Stride: 1},
{Lo: 10240, Hi: 12288, Stride: 2048},
{Lo: 12644, Hi: 65024, Stride: 52380},
{Lo: 65025, Hi: 65039, Stride: 1},
{Lo: 65279, Hi: 65440, Stride: 161},
{Lo: 65520, Hi: 65528, Stride: 1},
{Lo: 65532, Hi: 65532, Stride: 1},
},
R32: []unicode.Range32{
{Lo: 78844, Hi: 119155, Stride: 40311},
{Lo: 119156, Hi: 119162, Stride: 1},
{Lo: 917504, Hi: 917631, Stride: 1},
{Lo: 917760, Hi: 917999, Stride: 1},
},
LatinOffset: 2,
}
}