初始提交: Gitea 项目代码
This commit is contained in:
@@ -0,0 +1,59 @@
|
||||
// Copyright 2022 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package charset
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"strings"
|
||||
"unicode"
|
||||
|
||||
"gitea.dev/modules/translation"
|
||||
)
|
||||
|
||||
// AmbiguousTablesForLocale provides the table of ambiguous characters for this locale.
|
||||
func AmbiguousTablesForLocale(locale translation.Locale) []*AmbiguousTable {
|
||||
ambiguousTableMap := globalVars().ambiguousTableMap
|
||||
key := locale.Language()
|
||||
var table *AmbiguousTable
|
||||
var ok bool
|
||||
for len(key) > 0 {
|
||||
if table, ok = ambiguousTableMap[key]; ok {
|
||||
break
|
||||
}
|
||||
idx := strings.LastIndexAny(key, "-_")
|
||||
if idx < 0 {
|
||||
key = ""
|
||||
} else {
|
||||
key = key[:idx]
|
||||
}
|
||||
}
|
||||
if table == nil && (locale.Language() == "zh-CN" || locale.Language() == "zh_CN") {
|
||||
table = ambiguousTableMap["zh-hans"]
|
||||
}
|
||||
if table == nil && strings.HasPrefix(locale.Language(), "zh") {
|
||||
table = ambiguousTableMap["zh-hant"]
|
||||
}
|
||||
if table == nil {
|
||||
table = ambiguousTableMap["_default"]
|
||||
}
|
||||
|
||||
return []*AmbiguousTable{
|
||||
table,
|
||||
ambiguousTableMap["_common"],
|
||||
}
|
||||
}
|
||||
|
||||
func isAmbiguous(r rune, confusableTo *rune, tables ...*AmbiguousTable) bool {
|
||||
for _, table := range tables {
|
||||
if !unicode.Is(table.RangeTable, r) {
|
||||
continue
|
||||
}
|
||||
i := sort.Search(len(table.Confusable), func(i int) bool {
|
||||
return table.Confusable[i] >= r
|
||||
})
|
||||
*confusableTo = table.With[i]
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
@@ -0,0 +1,37 @@
|
||||
// Copyright 2022 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package charset
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"testing"
|
||||
"unicode"
|
||||
|
||||
"gitea.dev/modules/translation"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestAmbiguousCharacters(t *testing.T) {
|
||||
for locale, ambiguous := range globalVars().ambiguousTableMap {
|
||||
assert.Equal(t, locale, ambiguous.Locale)
|
||||
assert.Len(t, ambiguous.With, len(ambiguous.Confusable))
|
||||
assert.True(t, sort.SliceIsSorted(ambiguous.Confusable, func(i, j int) bool {
|
||||
return ambiguous.Confusable[i] < ambiguous.Confusable[j]
|
||||
}))
|
||||
|
||||
for _, confusable := range ambiguous.Confusable {
|
||||
assert.True(t, unicode.Is(ambiguous.RangeTable, confusable))
|
||||
i := sort.Search(len(ambiguous.Confusable), func(j int) bool {
|
||||
return ambiguous.Confusable[j] >= confusable
|
||||
})
|
||||
found := i < len(ambiguous.Confusable) && ambiguous.Confusable[i] == confusable
|
||||
assert.True(t, found, "%c is not in %d", confusable, i)
|
||||
}
|
||||
}
|
||||
|
||||
var confusableTo rune
|
||||
ret := isAmbiguous('𝐾', &confusableTo, AmbiguousTablesForLocale(&translation.MockLocale{})...)
|
||||
assert.True(t, ret)
|
||||
}
|
||||
@@ -0,0 +1,210 @@
|
||||
// Copyright 2014 The Gogs Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package charset
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"io"
|
||||
"regexp"
|
||||
"strings"
|
||||
"sync"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
|
||||
"gitea.dev/modules/setting"
|
||||
"gitea.dev/modules/util"
|
||||
|
||||
"github.com/gogs/chardet"
|
||||
"golang.org/x/net/html/charset"
|
||||
"golang.org/x/text/transform"
|
||||
)
|
||||
|
||||
var globalVars = sync.OnceValue(func() (ret struct {
|
||||
utf8Bom []byte
|
||||
|
||||
defaultWordRegexp *regexp.Regexp
|
||||
ambiguousTableMap map[string]*AmbiguousTable
|
||||
invisibleRangeTable *unicode.RangeTable
|
||||
},
|
||||
) {
|
||||
ret.utf8Bom = []byte{'\xef', '\xbb', '\xbf'}
|
||||
ret.ambiguousTableMap = newAmbiguousTableMap()
|
||||
ret.invisibleRangeTable = newInvisibleRangeTable()
|
||||
return ret
|
||||
})
|
||||
|
||||
// ConvertOpts controls how content is converted to UTF-8.
type ConvertOpts struct {
	// KeepBOM keeps a leading UTF-8 BOM in the result instead of trimming it.
	KeepBOM bool
	// ErrorReplacement is written in place of input that cannot be converted.
	// When nil, ToUTF8 copies the offending byte through unchanged.
	ErrorReplacement []byte
	// ErrorReturnOrigin makes ToUTF8 return the original content untouched
	// as soon as a conversion error occurs.
	ErrorReturnOrigin bool
}
|
||||
|
||||
// ToUTF8WithFallbackReaderPrefetchSize is the number of bytes ToUTF8WithFallbackReader
// reads up front to detect the encoding (16 KiB). It is a variable so tests can change it.
var ToUTF8WithFallbackReaderPrefetchSize = 16 << 10
|
||||
|
||||
// ToUTF8WithFallbackReader detects the encoding of content and converts to UTF-8 reader if possible
|
||||
func ToUTF8WithFallbackReader(rd io.Reader, opts ConvertOpts) io.Reader {
|
||||
buf := make([]byte, ToUTF8WithFallbackReaderPrefetchSize)
|
||||
n, err := util.ReadAtMost(rd, buf)
|
||||
if err != nil {
|
||||
// read error occurs, don't do any processing
|
||||
return io.MultiReader(bytes.NewReader(buf[:n]), rd)
|
||||
}
|
||||
|
||||
charsetLabel, _ := DetectEncoding(buf[:n])
|
||||
if charsetLabel == "UTF-8" {
|
||||
// is utf-8, try to remove BOM and read it as-is
|
||||
return io.MultiReader(bytes.NewReader(maybeRemoveBOM(buf[:n], opts)), rd)
|
||||
}
|
||||
|
||||
encoding, _ := charset.Lookup(charsetLabel)
|
||||
if encoding == nil {
|
||||
// unknown charset, don't do any processing
|
||||
return io.MultiReader(bytes.NewReader(buf[:n]), rd)
|
||||
}
|
||||
|
||||
// convert from charset to utf-8
|
||||
return transform.NewReader(
|
||||
io.MultiReader(bytes.NewReader(buf[:n]), rd),
|
||||
encoding.NewDecoder(),
|
||||
)
|
||||
}
|
||||
|
||||
// ToUTF8WithFallback detects the encoding of content and converts to UTF-8 if possible
|
||||
func ToUTF8WithFallback(content []byte, opts ConvertOpts) []byte {
|
||||
bs, _ := io.ReadAll(ToUTF8WithFallbackReader(bytes.NewReader(content), opts))
|
||||
return bs
|
||||
}
|
||||
|
||||
func ToUTF8DropErrors(content []byte) []byte {
|
||||
return ToUTF8(content, ConvertOpts{ErrorReplacement: []byte{' '}})
|
||||
}
|
||||
|
||||
func ToUTF8(content []byte, opts ConvertOpts) []byte {
|
||||
charsetLabel, _ := DetectEncoding(content)
|
||||
if charsetLabel == "UTF-8" {
|
||||
return maybeRemoveBOM(content, opts)
|
||||
}
|
||||
|
||||
encoding, _ := charset.Lookup(charsetLabel)
|
||||
if encoding == nil {
|
||||
setting.PanicInDevOrTesting("unsupported detected charset %q, it shouldn't happen", charsetLabel)
|
||||
if opts.ErrorReturnOrigin {
|
||||
return content
|
||||
}
|
||||
return bytes.ToValidUTF8(content, opts.ErrorReplacement)
|
||||
}
|
||||
|
||||
var decoded []byte
|
||||
decoder := encoding.NewDecoder()
|
||||
idx := 0
|
||||
for idx < len(content) {
|
||||
result, n, err := transform.Bytes(decoder, content[idx:])
|
||||
decoded = append(decoded, result...)
|
||||
if err == nil {
|
||||
break
|
||||
}
|
||||
if opts.ErrorReturnOrigin {
|
||||
return content
|
||||
}
|
||||
if opts.ErrorReplacement == nil {
|
||||
decoded = append(decoded, content[idx+n])
|
||||
} else {
|
||||
decoded = append(decoded, opts.ErrorReplacement...)
|
||||
}
|
||||
idx += n + 1
|
||||
}
|
||||
return maybeRemoveBOM(decoded, opts)
|
||||
}
|
||||
|
||||
// maybeRemoveBOM removes a UTF-8 BOM from a []byte when opts.KeepBOM is false
|
||||
func maybeRemoveBOM(content []byte, opts ConvertOpts) []byte {
|
||||
if opts.KeepBOM {
|
||||
return content
|
||||
}
|
||||
return bytes.TrimPrefix(content, globalVars().utf8Bom)
|
||||
}
|
||||
|
||||
// DetectEncoding detect the encoding of content
|
||||
// it always returns a detected or guessed "encoding" string, no matter error happens or not
|
||||
func DetectEncoding(content []byte) (encoding string, _ error) {
|
||||
// First we check if the content represents valid utf8 content excepting a truncated character at the end.
|
||||
|
||||
// Now we could decode all the runes in turn but this is not necessarily the cheapest thing to do
|
||||
// instead we walk backwards from the end to trim off the incomplete character
|
||||
toValidate := content
|
||||
end := len(toValidate) - 1
|
||||
|
||||
// U+0000 U+007F 0yyyzzzz
|
||||
// U+0080 U+07FF 110xxxyy 10yyzzzz
|
||||
// U+0800 U+FFFF 1110wwww 10xxxxyy 10yyzzzz
|
||||
// U+010000 U+10FFFF 11110uvv 10vvwwww 10xxxxyy 10yyzzzz
|
||||
cnt := 0
|
||||
for end >= 0 && cnt < 4 {
|
||||
c := toValidate[end]
|
||||
if c>>5 == 0b110 || c>>4 == 0b1110 || c>>3 == 0b11110 {
|
||||
// a leading byte
|
||||
toValidate = toValidate[:end]
|
||||
break
|
||||
} else if c>>6 == 0b10 {
|
||||
// a continuation byte
|
||||
end--
|
||||
} else {
|
||||
// not an utf-8 byte
|
||||
break
|
||||
}
|
||||
cnt++
|
||||
}
|
||||
|
||||
if utf8.Valid(toValidate) {
|
||||
return "UTF-8", nil
|
||||
}
|
||||
|
||||
textDetector := chardet.NewTextDetector()
|
||||
var detectContent []byte
|
||||
if len(content) < 1024 {
|
||||
// Check if original content is valid
|
||||
if _, err := textDetector.DetectBest(content); err != nil {
|
||||
return util.IfZero(setting.Repository.AnsiCharset, "UTF-8"), err
|
||||
}
|
||||
times := 1024 / len(content)
|
||||
detectContent = make([]byte, 0, times*len(content))
|
||||
for range times {
|
||||
detectContent = append(detectContent, content...)
|
||||
}
|
||||
} else {
|
||||
detectContent = content
|
||||
}
|
||||
|
||||
// Now we can't use DetectBest or just results[0] because the result isn't stable - so we need a tie-break
|
||||
results, err := textDetector.DetectAll(detectContent)
|
||||
if err != nil {
|
||||
return util.IfZero(setting.Repository.AnsiCharset, "UTF-8"), err
|
||||
}
|
||||
|
||||
topConfidence := results[0].Confidence
|
||||
topResult := results[0]
|
||||
priority, has := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(topResult.Charset))]
|
||||
for _, result := range results {
|
||||
// As results are sorted in confidence order - if we have a different confidence
|
||||
// we know it's less than the current confidence and can break out of the loop early
|
||||
if result.Confidence != topConfidence {
|
||||
break
|
||||
}
|
||||
|
||||
// Otherwise check if this results is earlier in the DetectedCharsetOrder than our current top guess
|
||||
resultPriority, resultHas := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(result.Charset))]
|
||||
if resultHas && (!has || resultPriority < priority) {
|
||||
topResult = result
|
||||
priority = resultPriority
|
||||
has = true
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: to properly decouple this function the fallback ANSI charset should be passed as an argument
|
||||
if topResult.Charset != "UTF-8" && setting.Repository.AnsiCharset != "" {
|
||||
return setting.Repository.AnsiCharset, err
|
||||
}
|
||||
|
||||
return topResult.Charset, nil
|
||||
}
|
||||
@@ -0,0 +1,247 @@
|
||||
// Copyright 2019 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package charset
|
||||
|
||||
import (
|
||||
"io"
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"gitea.dev/modules/setting"
|
||||
"gitea.dev/modules/test"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
setting.Repository.DetectedCharsetScore = map[string]int{}
|
||||
for i, charset := range setting.Repository.DetectedCharsetsOrder {
|
||||
setting.Repository.DetectedCharsetScore[strings.ToLower(charset)] = i
|
||||
}
|
||||
os.Exit(m.Run())
|
||||
}
|
||||
|
||||
func TestMaybeRemoveBOM(t *testing.T) {
|
||||
res := maybeRemoveBOM([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{})
|
||||
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
|
||||
|
||||
res = maybeRemoveBOM([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{})
|
||||
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
|
||||
}
|
||||
|
||||
func TestToUTF8(t *testing.T) {
|
||||
// Note: golang compiler seems so behave differently depending on the current
|
||||
// locale, so some conversions might behave differently. For that reason, we don't
|
||||
// depend on particular conversions but in expected behaviors.
|
||||
|
||||
res := ToUTF8([]byte{0x41, 0x42, 0x43}, ConvertOpts{})
|
||||
assert.Equal(t, "ABC", string(res))
|
||||
|
||||
// "áéíóú"
|
||||
res = ToUTF8([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{})
|
||||
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
|
||||
|
||||
// "áéíóú"
|
||||
res = ToUTF8([]byte{
|
||||
0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3,
|
||||
0xc3, 0xba,
|
||||
}, ConvertOpts{})
|
||||
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
|
||||
|
||||
res = ToUTF8([]byte{
|
||||
0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
|
||||
0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e,
|
||||
}, ConvertOpts{})
|
||||
stringMustStartWith(t, "Hola,", res)
|
||||
stringMustEndWith(t, "AAA.", res)
|
||||
|
||||
res = ToUTF8([]byte{
|
||||
0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
|
||||
0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e,
|
||||
}, ConvertOpts{})
|
||||
stringMustStartWith(t, "Hola,", res)
|
||||
stringMustEndWith(t, "AAA.", res)
|
||||
|
||||
res = ToUTF8([]byte{
|
||||
0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
|
||||
0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e,
|
||||
}, ConvertOpts{})
|
||||
stringMustStartWith(t, "Hola,", res)
|
||||
stringMustEndWith(t, "AAA.", res)
|
||||
|
||||
// Japanese (Shift-JIS)
|
||||
// 日属秘ぞしちゅ。
|
||||
res = ToUTF8([]byte{
|
||||
0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82,
|
||||
0xBF, 0x82, 0xE3, 0x81, 0x42,
|
||||
}, ConvertOpts{})
|
||||
assert.Equal(t, []byte{
|
||||
0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3,
|
||||
0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82,
|
||||
}, res)
|
||||
|
||||
res = ToUTF8([]byte{0x00, 0x00, 0x00, 0x00}, ConvertOpts{})
|
||||
assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, res)
|
||||
}
|
||||
|
||||
func TestToUTF8WithFallback(t *testing.T) {
|
||||
// "ABC"
|
||||
res := ToUTF8WithFallback([]byte{0x41, 0x42, 0x43}, ConvertOpts{})
|
||||
assert.Equal(t, []byte{0x41, 0x42, 0x43}, res)
|
||||
|
||||
// "áéíóú"
|
||||
res = ToUTF8WithFallback([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{})
|
||||
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
|
||||
|
||||
// UTF8 BOM + "áéíóú"
|
||||
res = ToUTF8WithFallback([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{})
|
||||
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
|
||||
|
||||
// "Hola, así cómo ños"
|
||||
res = ToUTF8WithFallback([]byte{
|
||||
0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
|
||||
0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73,
|
||||
}, ConvertOpts{})
|
||||
assert.Equal(t, []byte{
|
||||
0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xC3, 0xAD, 0x20, 0x63,
|
||||
0xC3, 0xB3, 0x6D, 0x6F, 0x20, 0xC3, 0xB1, 0x6F, 0x73,
|
||||
}, res)
|
||||
|
||||
// "Hola, así cómo "
|
||||
minmatch := []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xC3, 0xAD, 0x20, 0x63, 0xC3, 0xB3, 0x6D, 0x6F, 0x20}
|
||||
|
||||
res = ToUTF8WithFallback([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73}, ConvertOpts{})
|
||||
// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
|
||||
assert.Equal(t, minmatch, res[0:len(minmatch)])
|
||||
|
||||
res = ToUTF8WithFallback([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73}, ConvertOpts{})
|
||||
// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
|
||||
assert.Equal(t, minmatch, res[0:len(minmatch)])
|
||||
|
||||
// Japanese (Shift-JIS)
|
||||
// "日属秘ぞしちゅ。"
|
||||
res = ToUTF8WithFallback([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42}, ConvertOpts{})
|
||||
assert.Equal(t, []byte{
|
||||
0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3,
|
||||
0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82,
|
||||
}, res)
|
||||
|
||||
res = ToUTF8WithFallback([]byte{0x00, 0x00, 0x00, 0x00}, ConvertOpts{})
|
||||
assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, res)
|
||||
}
|
||||
|
||||
func TestToUTF8DropErrors(t *testing.T) {
|
||||
// "ABC"
|
||||
res := ToUTF8DropErrors([]byte{0x41, 0x42, 0x43})
|
||||
assert.Equal(t, []byte{0x41, 0x42, 0x43}, res)
|
||||
|
||||
// "áéíóú"
|
||||
res = ToUTF8DropErrors([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
|
||||
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
|
||||
|
||||
// UTF8 BOM + "áéíóú"
|
||||
res = ToUTF8DropErrors([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
|
||||
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
|
||||
|
||||
// "Hola, así cómo ños"
|
||||
res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73})
|
||||
assert.Equal(t, []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73}, res[:8])
|
||||
assert.Equal(t, []byte{0x73}, res[len(res)-1:])
|
||||
|
||||
// "Hola, así cómo "
|
||||
minmatch := []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xC3, 0xAD, 0x20, 0x63, 0xC3, 0xB3, 0x6D, 0x6F, 0x20}
|
||||
|
||||
res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73})
|
||||
// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
|
||||
assert.Equal(t, minmatch, res[0:len(minmatch)])
|
||||
|
||||
res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73})
|
||||
// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
|
||||
assert.Equal(t, minmatch, res[0:len(minmatch)])
|
||||
|
||||
// Japanese (Shift-JIS)
|
||||
// "日属秘ぞしちゅ。"
|
||||
res = ToUTF8DropErrors([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42})
|
||||
assert.Equal(t, []byte{
|
||||
0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3,
|
||||
0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82,
|
||||
}, res)
|
||||
|
||||
res = ToUTF8DropErrors([]byte{0x00, 0x00, 0x00, 0x00})
|
||||
assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, res)
|
||||
}
|
||||
|
||||
func TestDetectEncoding(t *testing.T) {
|
||||
testSuccess := func(b []byte, expected string) {
|
||||
encoding, err := DetectEncoding(b)
|
||||
assert.NoError(t, err)
|
||||
assert.Equal(t, expected, encoding)
|
||||
}
|
||||
|
||||
// invalid bytes
|
||||
encoding, err := DetectEncoding([]byte{0xfa})
|
||||
assert.Error(t, err)
|
||||
assert.Equal(t, "UTF-8", encoding)
|
||||
|
||||
// utf-8
|
||||
b := []byte("just some ascii")
|
||||
testSuccess(b, "UTF-8")
|
||||
|
||||
// utf-8-sig: "hey" (with BOM)
|
||||
b = []byte{0xef, 0xbb, 0xbf, 0x68, 0x65, 0x79}
|
||||
testSuccess(b, "UTF-8")
|
||||
|
||||
// utf-16: "hey<accented G>"
|
||||
b = []byte{0xff, 0xfe, 0x68, 0x00, 0x65, 0x00, 0x79, 0x00, 0xf4, 0x01}
|
||||
testSuccess(b, "UTF-16LE")
|
||||
|
||||
// iso-8859-1: d<accented e>cor<newline>
|
||||
b = []byte{0x44, 0xe9, 0x63, 0x6f, 0x72, 0x0a}
|
||||
encoding, err = DetectEncoding(b)
|
||||
assert.NoError(t, err)
|
||||
assert.Contains(t, encoding, "ISO-8859-1")
|
||||
|
||||
defer test.MockVariableValue(&setting.Repository.AnsiCharset, "MyEncoding")()
|
||||
testSuccess(b, "MyEncoding")
|
||||
}
|
||||
|
||||
func stringMustStartWith(t *testing.T, expected string, value []byte) {
|
||||
assert.Equal(t, expected, string(value[:len(expected)]))
|
||||
}
|
||||
|
||||
func stringMustEndWith(t *testing.T, expected string, value []byte) {
|
||||
assert.Equal(t, expected, string(value[len(value)-len(expected):]))
|
||||
}
|
||||
|
||||
func TestToUTF8WithFallbackReader(t *testing.T) {
|
||||
test.MockVariableValue(&ToUTF8WithFallbackReaderPrefetchSize)
|
||||
|
||||
block := "aá啊🤔"
|
||||
runes := []rune(block)
|
||||
assert.Len(t, string(runes[0]), 1)
|
||||
assert.Len(t, string(runes[1]), 2)
|
||||
assert.Len(t, string(runes[2]), 3)
|
||||
assert.Len(t, string(runes[3]), 4)
|
||||
|
||||
content := strings.Repeat(block, 2)
|
||||
for i := 1; i < len(content); i++ {
|
||||
encoding, err := DetectEncoding([]byte(content[:i]))
|
||||
assert.NoError(t, err)
|
||||
assert.Equal(t, "UTF-8", encoding)
|
||||
|
||||
ToUTF8WithFallbackReaderPrefetchSize = i
|
||||
rd := ToUTF8WithFallbackReader(strings.NewReader(content), ConvertOpts{})
|
||||
r, _ := io.ReadAll(rd)
|
||||
assert.Equal(t, content, string(r))
|
||||
}
|
||||
for _, r := range runes {
|
||||
content = "abc abc " + string(r) + string(r) + string(r)
|
||||
for i := 0; i < len(content); i++ {
|
||||
encoding, err := DetectEncoding([]byte(content[:i]))
|
||||
assert.NoError(t, err)
|
||||
assert.Equal(t, "UTF-8", encoding)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,43 @@
|
||||
// Copyright 2022 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package charset
|
||||
|
||||
import (
|
||||
"html/template"
|
||||
"io"
|
||||
"strings"
|
||||
|
||||
"gitea.dev/modules/setting"
|
||||
"gitea.dev/modules/translation"
|
||||
)
|
||||
|
||||
// EscapeOptions customizes the behavior of the Unicode escaper.
type EscapeOptions struct {
	// Allowed is a set of runes handed to the escaper; presumably these runes
	// are exempt from escaping (their handling is not visible in this file).
	Allowed map[rune]bool
}
|
||||
|
||||
// AllowRuneNBSP returns a new rune set that contains only the no-break space (U+00A0).
func AllowRuneNBSP() map[rune]bool {
	const nbsp rune = 0xa0
	allowed := make(map[rune]bool, 1)
	allowed[nbsp] = true
	return allowed
}
|
||||
|
||||
func EscapeOptionsForView() EscapeOptions {
|
||||
return EscapeOptions{
|
||||
// it's safe to see NBSP in the view, but maybe not in the diff
|
||||
Allowed: AllowRuneNBSP(),
|
||||
}
|
||||
}
|
||||
|
||||
// EscapeControlHTML escapes the Unicode control sequences in a provided html document
|
||||
func EscapeControlHTML(html template.HTML, locale translation.Locale, opts ...EscapeOptions) (escaped *EscapeStatus, output template.HTML) {
|
||||
if !setting.UI.AmbiguousUnicodeDetection {
|
||||
return &EscapeStatus{}, html
|
||||
}
|
||||
sb := &strings.Builder{}
|
||||
escaped, _ = EscapeControlReader(strings.NewReader(string(html)), sb, locale, opts...) // err has been handled in EscapeControlReader
|
||||
return escaped, template.HTML(sb.String())
|
||||
}
|
||||
|
||||
// EscapeControlReader escapes the Unicode control sequences in a provided reader of HTML content and writer in a locale and returns the findings as an EscapeStatus
|
||||
func EscapeControlReader(reader io.Reader, writer io.Writer, locale translation.Locale, opts ...EscapeOptions) (*EscapeStatus, error) {
|
||||
return escapeStream(locale, reader, writer, opts...)
|
||||
}
|
||||
@@ -0,0 +1,23 @@
|
||||
// Copyright 2021 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package charset
|
||||
|
||||
// EscapeStatus represents the findings of the Unicode escaper
type EscapeStatus struct {
	Escaped      bool // it means that some characters were escaped, and they can also be unescaped back
	HasInvisible bool
	HasAmbiguous bool
}

// Or merges other into status: each flag of the result is the logical OR of the
// corresponding flags of both statuses.
//
// A non-nil receiver is updated in place and returned. A nil receiver is
// treated as an empty status and a newly allocated one is returned. A nil
// other contributes nothing.
func (status *EscapeStatus) Or(other *EscapeStatus) *EscapeStatus {
	st := status
	if status == nil {
		st = &EscapeStatus{}
	}
	if other == nil {
		// nothing to merge, and dereferencing other would panic
		return st
	}
	st.Escaped = st.Escaped || other.Escaped
	st.HasAmbiguous = st.HasAmbiguous || other.HasAmbiguous
	st.HasInvisible = st.HasInvisible || other.HasInvisible
	return st
}
|
||||
@@ -0,0 +1,418 @@
|
||||
// Copyright 2022 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package charset
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"html"
|
||||
"io"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
|
||||
"gitea.dev/modules/setting"
|
||||
"gitea.dev/modules/translation"
|
||||
)
|
||||
|
||||
// htmlChunkReader reads HTML from an io.Reader and splits it into alternating
// tag parts and content parts (see readRunes).
type htmlChunkReader struct {
	// in is the source of the HTML bytes
	in io.Reader
	// readErr is the error returned by the most recent read from in; once it is
	// non-nil no further reads are attempted
	readErr error
	// readBuf holds the bytes that were read but not yet returned as parts
	readBuf []byte
	// curInTag tells whether the next unprocessed byte is inside an HTML tag
	curInTag bool
}
|
||||
|
||||
type escapeStreamer struct {
|
||||
htmlChunkReader
|
||||
|
||||
escaped *EscapeStatus
|
||||
locale translation.Locale
|
||||
ambiguousTables []*AmbiguousTable
|
||||
allowed map[rune]bool
|
||||
|
||||
out io.Writer
|
||||
}
|
||||
|
||||
func escapeStream(locale translation.Locale, in io.Reader, out io.Writer, opts ...EscapeOptions) (*EscapeStatus, error) {
|
||||
es := &escapeStreamer{
|
||||
escaped: &EscapeStatus{},
|
||||
locale: locale,
|
||||
ambiguousTables: AmbiguousTablesForLocale(locale),
|
||||
htmlChunkReader: htmlChunkReader{
|
||||
in: in,
|
||||
readBuf: make([]byte, 0, 32*1024),
|
||||
},
|
||||
out: out,
|
||||
}
|
||||
|
||||
if len(opts) > 0 {
|
||||
es.allowed = opts[0].Allowed
|
||||
}
|
||||
|
||||
readCount := 0
|
||||
lastIsTag := false
|
||||
for {
|
||||
parts, partInTag, err := es.readRunes()
|
||||
readCount++
|
||||
if err == io.EOF {
|
||||
return es.escaped, nil
|
||||
} else if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
for i, part := range parts {
|
||||
if partInTag[i] {
|
||||
lastIsTag = true
|
||||
if _, err := out.Write(part); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
} else {
|
||||
// if last part is tag, then this part is content begin
|
||||
// if the content is the first part of the first read, then it's also content begin
|
||||
isContentBegin := lastIsTag || (readCount == 1 && i == 0)
|
||||
lastIsTag = false
|
||||
if isContentBegin {
|
||||
if part, err = es.trimAndWriteBom(part); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
if err = es.detectAndWriteRunes(part); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (e *escapeStreamer) trimAndWriteBom(part []byte) ([]byte, error) {
|
||||
remaining, ok := bytes.CutPrefix(part, globalVars().utf8Bom)
|
||||
if ok {
|
||||
part = remaining
|
||||
if _, err := e.out.Write(globalVars().utf8Bom); err != nil {
|
||||
return part, err
|
||||
}
|
||||
}
|
||||
return part, nil
|
||||
}
|
||||
|
||||
// longSentenceDetectionLimit is both the minimum number of detect results for
// the long-sentence heuristic to be applied and the number of results inspected
// on each side of a position by that heuristic.
const longSentenceDetectionLimit = 20
|
||||
|
||||
func (e *escapeStreamer) possibleLongSentence(results []detectResult, pos int) bool {
|
||||
countBasic := 0
|
||||
countNonASCII := 0
|
||||
for i := max(pos-longSentenceDetectionLimit, 0); i < min(pos+longSentenceDetectionLimit, len(results)); i++ {
|
||||
if results[i].runeType == runeTypeBasic && results[i].runeChar != ' ' {
|
||||
countBasic++
|
||||
}
|
||||
if results[i].runeType == runeTypeNonASCII || results[i].runeType == runeTypeAmbiguous {
|
||||
countNonASCII++
|
||||
}
|
||||
}
|
||||
countChar := countBasic + countNonASCII
|
||||
// many non-ASCII runes around, it seems to be a sentence,
|
||||
// don't handle the invisible/ambiguous chars in it, otherwise it will be too noisy
|
||||
return countChar != 0 && countNonASCII*100/countChar >= 50
|
||||
}
|
||||
|
||||
func (e *escapeStreamer) analyzeDetectResults(results []detectResult) {
|
||||
for i := range results {
|
||||
res := &results[i]
|
||||
if res.runeType == runeTypeInvisible || res.runeType == runeTypeAmbiguous {
|
||||
leftIsNonASCII := i > 0 && (results[i-1].runeType == runeTypeNonASCII || results[i-1].runeType == runeTypeAmbiguous)
|
||||
rightIsNonASCII := i < len(results)-1 && (results[i+1].runeType == runeTypeNonASCII || results[i+1].runeType == runeTypeAmbiguous)
|
||||
surroundingNonASCII := leftIsNonASCII || rightIsNonASCII
|
||||
if !surroundingNonASCII {
|
||||
if len(results) < longSentenceDetectionLimit {
|
||||
res.needEscape = setting.UI.AmbiguousUnicodeDetection
|
||||
} else if !e.possibleLongSentence(results, i) {
|
||||
res.needEscape = setting.UI.AmbiguousUnicodeDetection
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (e *escapeStreamer) detectAndWriteRunes(part []byte) error {
|
||||
results := e.detectRunes(part)
|
||||
e.analyzeDetectResults(results)
|
||||
return e.writeDetectResults(part, results)
|
||||
}
|
||||
|
||||
func (e *htmlChunkReader) readRunes() (parts [][]byte, partInTag []bool, _ error) {
|
||||
// we have read everything, eof
|
||||
if e.readErr != nil && len(e.readBuf) == 0 {
|
||||
return nil, nil, e.readErr
|
||||
}
|
||||
|
||||
// not eof, and the there is space in the buffer, try to read more data
|
||||
if e.readErr == nil && len(e.readBuf) <= cap(e.readBuf)*3/4 {
|
||||
n, err := e.in.Read(e.readBuf[len(e.readBuf):cap(e.readBuf)])
|
||||
e.readErr = err
|
||||
e.readBuf = e.readBuf[:len(e.readBuf)+n]
|
||||
}
|
||||
if len(e.readBuf) == 0 {
|
||||
return nil, nil, e.readErr
|
||||
}
|
||||
|
||||
// try to exact tag parts and content parts
|
||||
pos := 0
|
||||
for pos < len(e.readBuf) {
|
||||
var curPartEnd int
|
||||
nextInTag := e.curInTag
|
||||
if e.curInTag {
|
||||
// if cur part is in tag, try to find the tag close char '>'
|
||||
idx := bytes.IndexByte(e.readBuf[pos:], '>')
|
||||
if idx == -1 {
|
||||
// if no tag close char, then the whole buffer is in tag
|
||||
curPartEnd = len(e.readBuf)
|
||||
} else {
|
||||
// tag part ends, switch to content part
|
||||
curPartEnd = pos + idx + 1
|
||||
nextInTag = !nextInTag
|
||||
}
|
||||
} else {
|
||||
// if cur part is in content, try to find the tag open char '<'
|
||||
idx := bytes.IndexByte(e.readBuf[pos:], '<')
|
||||
if idx == -1 {
|
||||
// if no tag open char, then the whole buffer is in content
|
||||
curPartEnd = len(e.readBuf)
|
||||
} else {
|
||||
// content part ends, switch to tag part
|
||||
curPartEnd = pos + idx
|
||||
nextInTag = !nextInTag
|
||||
}
|
||||
}
|
||||
|
||||
curPartLen := curPartEnd - pos
|
||||
if curPartLen == 0 {
|
||||
// if cur part is empty, only need to switch the part type
|
||||
if e.curInTag == nextInTag {
|
||||
panic("impossible, curPartLen is 0 but the part in tag status is not switched")
|
||||
}
|
||||
e.curInTag = nextInTag
|
||||
continue
|
||||
}
|
||||
|
||||
// now, curPartLen can't be 0
|
||||
curPart := make([]byte, curPartLen)
|
||||
copy(curPart, e.readBuf[pos:curPartEnd])
|
||||
// now we get the curPart bytes, but we can't directly use it, the last rune in it might have been cut
|
||||
// try to decode the last rune, if it's invalid, then we cut the last byte and try again until we get a valid rune or no byte left
|
||||
for i := curPartLen - 1; i >= 0; i-- {
|
||||
last, lastSize := utf8.DecodeRune(curPart[i:])
|
||||
if last == utf8.RuneError && lastSize == 1 {
|
||||
curPartLen--
|
||||
} else {
|
||||
curPartLen += lastSize - 1
|
||||
break
|
||||
}
|
||||
}
|
||||
if curPartLen == 0 {
|
||||
// actually it's impossible that the part doesn't contain any valid rune,
|
||||
// the only case is that the cap(readBuf) is too small, or the origin contain indeed doesn't contain any valid rune
|
||||
// * try to leave the last 4 bytes (possible longest utf-8 encoding) to next round
|
||||
// * at least consume 1 byte to avoid infinite loop
|
||||
curPartLen = max(len(curPart)-utf8.UTFMax, 1)
|
||||
}
|
||||
|
||||
// if curPartLen is not the same as curPart, it means we have cut some bytes,
|
||||
// need to wait for more data if not eof
|
||||
trailingCorrupted := curPartLen != len(curPart)
|
||||
|
||||
// finally, we get the real part we need
|
||||
curPart = curPart[:curPartLen]
|
||||
parts = append(parts, curPart)
|
||||
partInTag = append(partInTag, e.curInTag)
|
||||
|
||||
pos += curPartLen
|
||||
e.curInTag = nextInTag
|
||||
|
||||
if trailingCorrupted && e.readErr == nil {
|
||||
// if the last part is corrupted, and we haven't reach eof, then we need to wait for more data to get the complete part
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
copy(e.readBuf, e.readBuf[pos:])
|
||||
e.readBuf = e.readBuf[:len(e.readBuf)-pos]
|
||||
return parts, partInTag, nil
|
||||
}
|
||||
|
||||
func (e *escapeStreamer) writeDetectResults(data []byte, results []detectResult) error {
|
||||
lastWriteRawIdx := -1
|
||||
for idx := range results {
|
||||
res := &results[idx]
|
||||
if !res.needEscape {
|
||||
if lastWriteRawIdx == -1 {
|
||||
lastWriteRawIdx = idx
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
if lastWriteRawIdx != -1 {
|
||||
if _, err := e.out.Write(data[results[lastWriteRawIdx].position:res.position]); err != nil {
|
||||
return err
|
||||
}
|
||||
lastWriteRawIdx = -1
|
||||
}
|
||||
switch res.runeType {
|
||||
case runeTypeBroken:
|
||||
if err := e.writeBrokenRune(data[res.position : res.position+res.runeSize]); err != nil {
|
||||
return err
|
||||
}
|
||||
case runeTypeAmbiguous:
|
||||
if err := e.writeAmbiguousRune(res.runeChar, res.confusable); err != nil {
|
||||
return err
|
||||
}
|
||||
case runeTypeInvisible:
|
||||
if err := e.writeInvisibleRune(res.runeChar); err != nil {
|
||||
return err
|
||||
}
|
||||
case runeTypeControlChar:
|
||||
if err := e.writeControlRune(res.runeChar); err != nil {
|
||||
return err
|
||||
}
|
||||
default:
|
||||
panic("unreachable")
|
||||
}
|
||||
}
|
||||
if lastWriteRawIdx != -1 {
|
||||
lastResult := results[len(results)-1]
|
||||
if _, err := e.out.Write(data[results[lastWriteRawIdx].position : lastResult.position+lastResult.runeSize]); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (e *escapeStreamer) writeBrokenRune(_ []byte) (err error) {
|
||||
// Although we'd like to use the original bytes to display (show the real broken content to users),
|
||||
// however, when this "escape stream" module is applied to the content, the content has already been processed by other modules.
|
||||
// So the invalid bytes just can't be kept till this step, in most (all) cases, the only thing we see here is utf8.RuneError
|
||||
_, err = io.WriteString(e.out, `<span class="broken-code-point">�</span>`)
|
||||
return err
|
||||
}
|
||||
|
||||
func (e *escapeStreamer) writeEscapedCharHTML(tag1, attr, tag2, content, tag3 string) (err error) {
|
||||
_, err = io.WriteString(e.out, tag1)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
_, err = io.WriteString(e.out, html.EscapeString(attr))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
_, err = io.WriteString(e.out, tag2)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
_, err = io.WriteString(e.out, html.EscapeString(content))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
_, err = io.WriteString(e.out, tag3)
|
||||
return err
|
||||
}
|
||||
|
||||
// runeToHex formats r as its bracketed code point, padded to at least four
// upper-case hex digits, e.g. 'K' becomes "[U+004B]".
func runeToHex(r rune) string {
	return "[U+" + fmt.Sprintf("%04X", r) + "]"
}
|
||||
|
||||
func (e *escapeStreamer) writeAmbiguousRune(r, c rune) (err error) {
|
||||
e.escaped.Escaped = true
|
||||
e.escaped.HasAmbiguous = true
|
||||
return e.writeEscapedCharHTML(
|
||||
`<span class="ambiguous-code-point" data-tooltip-content="`,
|
||||
e.locale.TrString("repo.ambiguous_character", string(r)+" "+runeToHex(r), string(c)+" "+runeToHex(c)),
|
||||
`"><span class="char">`,
|
||||
string(r),
|
||||
`</span></span>`,
|
||||
)
|
||||
}
|
||||
|
||||
func (e *escapeStreamer) writeInvisibleRune(r rune) error {
|
||||
e.escaped.Escaped = true
|
||||
e.escaped.HasInvisible = true
|
||||
return e.writeEscapedCharHTML(
|
||||
`<span class="escaped-code-point" data-escaped="`,
|
||||
runeToHex(r),
|
||||
`"><span class="char">`,
|
||||
string(r),
|
||||
`</span></span>`,
|
||||
)
|
||||
}
|
||||
|
||||
func (e *escapeStreamer) writeControlRune(r rune) error {
|
||||
var display string
|
||||
if r >= 0 && r <= 0x1f {
|
||||
display = string(0x2400 + r)
|
||||
} else if r == 0x7f {
|
||||
display = string(rune(0x2421))
|
||||
} else {
|
||||
display = runeToHex(r)
|
||||
}
|
||||
return e.writeEscapedCharHTML(
|
||||
`<span class="broken-code-point" data-escaped="`,
|
||||
display,
|
||||
`"><span class="char">`,
|
||||
string(r),
|
||||
`</span></span>`,
|
||||
)
|
||||
}
|
||||
|
||||
// detectResult describes one decoded rune of the input and how it was classified.
type detectResult struct {
	// runeChar is the decoded rune.
	runeChar rune
	// runeType is one of the runeType* constants; runeSize is the number of
	// bytes the rune occupies and position is its byte offset in the input.
	runeType, runeSize, position int
	// confusable is the rune that runeChar can be mistaken for; it is only
	// filled in for ambiguous runes.
	confusable rune
	// needEscape reports whether the rune must be written in escaped form.
	needEscape bool
}
|
||||
|
||||
// Rune categories assigned by detectRunes and stored in detectResult.runeType.
const (
|
||||
// runeTypeBasic is the zero value: space, tab, newline, allowed runes below 0x80,
// and any other rune below 0x80 that matches none of the categories below.
runeTypeBasic int = iota
|
||||
// runeTypeBroken is a rune that decoded as utf8.RuneError.
runeTypeBroken
|
||||
// runeTypeNonASCII is a rune >= 0x80 that is allowed or matches no other category.
runeTypeNonASCII
|
||||
// runeTypeAmbiguous is a rune found in one of the ambiguous tables.
runeTypeAmbiguous
|
||||
// runeTypeInvisible is a rune contained in the invisible range table.
runeTypeInvisible
|
||||
// runeTypeControlChar is a rune below 0x20 or equal to 0x7f that is not otherwise allowed.
runeTypeControlChar
|
||||
)
|
||||
|
||||
func (e *escapeStreamer) detectRunes(data []byte) []detectResult {
|
||||
runeCount := utf8.RuneCount(data)
|
||||
results := make([]detectResult, runeCount)
|
||||
invisibleRangeTable := globalVars().invisibleRangeTable
|
||||
var i int
|
||||
var confusable rune
|
||||
for pos := 0; pos < len(data); i++ {
|
||||
r, runeSize := utf8.DecodeRune(data[pos:])
|
||||
results[i].runeChar = r
|
||||
results[i].runeSize = runeSize
|
||||
results[i].position = pos
|
||||
pos += runeSize
|
||||
|
||||
switch {
|
||||
case r == utf8.RuneError:
|
||||
results[i].runeType = runeTypeBroken
|
||||
results[i].needEscape = true
|
||||
case r == ' ' || r == '\t' || r == '\n' || e.allowed[r]:
|
||||
results[i].runeType = runeTypeBasic
|
||||
if r >= 0x80 {
|
||||
results[i].runeType = runeTypeNonASCII
|
||||
}
|
||||
case r < 0x20 || r == 0x7f:
|
||||
results[i].runeType = runeTypeControlChar
|
||||
results[i].needEscape = true
|
||||
case unicode.Is(invisibleRangeTable, r):
|
||||
results[i].runeType = runeTypeInvisible
|
||||
// not sure about results[i].needEscape, will be detected separately
|
||||
case isAmbiguous(r, &confusable, e.ambiguousTables...):
|
||||
results[i].runeType = runeTypeAmbiguous
|
||||
results[i].confusable = confusable
|
||||
// not sure about results[i].needEscape, will be detected separately
|
||||
case r >= 0x80:
|
||||
results[i].runeType = runeTypeNonASCII
|
||||
default: // details to basic runes
|
||||
}
|
||||
}
|
||||
return results
|
||||
}
|
||||
@@ -0,0 +1,212 @@
|
||||
// Copyright 2021 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package charset
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"gitea.dev/modules/setting"
|
||||
"gitea.dev/modules/test"
|
||||
"gitea.dev/modules/translation"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
type escapeControlTest struct {
|
||||
name string
|
||||
text string
|
||||
status EscapeStatus
|
||||
result string
|
||||
}
|
||||
|
||||
var escapeControlTests = []escapeControlTest{
|
||||
{
|
||||
name: "<empty>",
|
||||
},
|
||||
{
|
||||
name: "single line western",
|
||||
text: "single line western",
|
||||
result: "single line western",
|
||||
status: EscapeStatus{},
|
||||
},
|
||||
{
|
||||
name: "multi line western",
|
||||
text: "single line western\nmulti line western\n",
|
||||
result: "single line western\nmulti line western\n",
|
||||
status: EscapeStatus{},
|
||||
},
|
||||
{
|
||||
name: "multi line western non-breaking space",
|
||||
text: "single line western\nmulti line western\n",
|
||||
result: `single line<span class="escaped-code-point" data-escaped="[U+00A0]"><span class="char"> </span></span>western` + "\n" + `multi line<span class="escaped-code-point" data-escaped="[U+00A0]"><span class="char"> </span></span>western` + "\n",
|
||||
status: EscapeStatus{Escaped: true, HasInvisible: true},
|
||||
},
|
||||
{
|
||||
name: "mixed scripts: western + japanese",
|
||||
text: "日属秘ぞしちゅ。Then some western.",
|
||||
result: "日属秘ぞしちゅ。Then some western.",
|
||||
status: EscapeStatus{},
|
||||
},
|
||||
{
|
||||
name: "japanese",
|
||||
text: "日属秘ぞしちゅ。",
|
||||
result: "日属秘ぞしちゅ。",
|
||||
status: EscapeStatus{},
|
||||
},
|
||||
{
|
||||
name: "hebrew", // old test was wrong, such text shouldn't be escaped
|
||||
text: "עד תקופת יוון העתיקה היה העיסוק במתמטיקה תכליתי בלבד: היא שימשה כאוסף של נוסחאות לחישוב קרקע, אוכלוסין וכו'. פריצת הדרך של היוונים, פרט לתרומותיהם הגדולות לידע המתמטי, הייתה בלימוד המתמטיקה כשלעצמה, מתוקף ערכה הרוחני. יחסם של חלק מהיוונים הקדמונים למתמטיקה היה דתי - למשל, הכת שאסף סביבו פיתגורס האמינה כי המתמטיקה היא הבסיס לכל הדברים. היוונים נחשבים ליוצרי מושג ההוכחה המתמטית, וכן לראשונים שעסקו במתמטיקה לשם עצמה, כלומר כתחום מחקרי עיוני ומופשט ולא רק כעזר שימושי. עם זאת, לצדה",
|
||||
result: "עד תקופת יוון העתיקה היה העיסוק במתמטיקה תכליתי בלבד: היא שימשה כאוסף של נוסחאות לחישוב קרקע, אוכלוסין וכו'. פריצת הדרך של היוונים, פרט לתרומותיהם הגדולות לידע המתמטי, הייתה בלימוד המתמטיקה כשלעצמה, מתוקף ערכה הרוחני. יחסם של חלק מהיוונים הקדמונים למתמטיקה היה דתי - למשל, הכת שאסף סביבו פיתגורס האמינה כי המתמטיקה היא הבסיס לכל הדברים. היוונים נחשבים ליוצרי מושג ההוכחה המתמטית, וכן לראשונים שעסקו במתמטיקה לשם עצמה, כלומר כתחום מחקרי עיוני ומופשט ולא רק כעזר שימושי. עם זאת, לצדה",
|
||||
status: EscapeStatus{},
|
||||
},
|
||||
{
|
||||
name: "more hebrew", // old test was wrong, such text shouldn't be escaped
|
||||
text: `בתקופה מאוחרת יותר, השתמשו היוונים בשיטת סימון מתקדמת יותר, שבה הוצגו המספרים לפי 22 אותיות האלפבית היווני. לסימון המספרים בין 1 ל-9 נקבעו תשע האותיות הראשונות, בתוספת גרש ( ' ) בצד ימין של האות, למעלה; תשע האותיות הבאות ייצגו את העשרות מ-10 עד 90, והבאות את המאות. לסימון הספרות בין 1000 ל-900,000, השתמשו היוונים באותן אותיות, אך הוסיפו לאותיות את הגרש דווקא מצד שמאל של האותיות, למטה. ממיליון ומעלה, כנראה השתמשו היוונים בשני תגים במקום אחד.
|
||||
|
||||
המתמטיקאי הבולט הראשון ביוון העתיקה, ויש האומרים בתולדות האנושות, הוא תאלס (624 לפנה"ס - 546 לפנה"ס בקירוב).[1] לא יהיה זה משולל יסוד להניח שהוא האדם הראשון שהוכיח משפט מתמטי, ולא רק גילה אותו. תאלס הוכיח שישרים מקבילים חותכים מצד אחד של שוקי זווית קטעים בעלי יחסים שווים (משפט תאלס הראשון), שהזווית המונחת על קוטר במעגל היא זווית ישרה (משפט תאלס השני), שהקוטר מחלק את המעגל לשני חלקים שווים, ושזוויות הבסיס במשולש שווה-שוקיים שוות זו לזו. מיוחסות לו גם שיטות למדידת גובהן של הפירמידות בעזרת מדידת צילן ולקביעת מיקומה של ספינה הנראית מן החוף.
|
||||
|
||||
בשנים 582 לפנה"ס עד 496 לפנה"ס, בקירוב, חי מתמטיקאי חשוב במיוחד - פיתגורס. המקורות הראשוניים עליו מועטים, וההיסטוריונים מתקשים להפריד את העובדות משכבת המסתורין והאגדות שנקשרו בו. ידוע שסביבו התקבצה האסכולה הפיתגוראית מעין כת פסבדו-מתמטית שהאמינה ש"הכל מספר", או ליתר דיוק הכל ניתן לכימות, וייחסה למספרים משמעויות מיסטיות. ככל הנראה הפיתגוראים ידעו לבנות את הגופים האפלטוניים, הכירו את הממוצע האריתמטי, הממוצע הגאומטרי והממוצע ההרמוני והגיעו להישגים חשובים נוספים. ניתן לומר שהפיתגוראים גילו את היותו של השורש הריבועי של 2, שהוא גם האלכסון בריבוע שאורך צלעותיו 1, אי רציונלי, אך תגליתם הייתה למעשה רק שהקטעים "חסרי מידה משותפת", ומושג המספר האי רציונלי מאוחר יותר.[2] אזכור ראשון לקיומם של קטעים חסרי מידה משותפת מופיע בדיאלוג "תאיטיטוס" של אפלטון, אך רעיון זה היה מוכר עוד קודם לכן, במאה החמישית לפנה"ס להיפאסוס, בן האסכולה הפיתגוראית, ואולי לפיתגורס עצמו.[3]`,
|
||||
result: `בתקופה מאוחרת יותר, השתמשו היוונים בשיטת סימון מתקדמת יותר, שבה הוצגו המספרים לפי 22 אותיות האלפבית היווני. לסימון המספרים בין 1 ל-9 נקבעו תשע האותיות הראשונות, בתוספת גרש ( ' ) בצד ימין של האות, למעלה; תשע האותיות הבאות ייצגו את העשרות מ-10 עד 90, והבאות את המאות. לסימון הספרות בין 1000 ל-900,000, השתמשו היוונים באותן אותיות, אך הוסיפו לאותיות את הגרש דווקא מצד שמאל של האותיות, למטה. ממיליון ומעלה, כנראה השתמשו היוונים בשני תגים במקום אחד.
|
||||
|
||||
המתמטיקאי הבולט הראשון ביוון העתיקה, ויש האומרים בתולדות האנושות, הוא תאלס (624 לפנה"ס - 546 לפנה"ס בקירוב).[1] לא יהיה זה משולל יסוד להניח שהוא האדם הראשון שהוכיח משפט מתמטי, ולא רק גילה אותו. תאלס הוכיח שישרים מקבילים חותכים מצד אחד של שוקי זווית קטעים בעלי יחסים שווים (משפט תאלס הראשון), שהזווית המונחת על קוטר במעגל היא זווית ישרה (משפט תאלס השני), שהקוטר מחלק את המעגל לשני חלקים שווים, ושזוויות הבסיס במשולש שווה-שוקיים שוות זו לזו. מיוחסות לו גם שיטות למדידת גובהן של הפירמידות בעזרת מדידת צילן ולקביעת מיקומה של ספינה הנראית מן החוף.
|
||||
|
||||
בשנים 582 לפנה"ס עד 496 לפנה"ס, בקירוב, חי מתמטיקאי חשוב במיוחד - פיתגורס. המקורות הראשוניים עליו מועטים, וההיסטוריונים מתקשים להפריד את העובדות משכבת המסתורין והאגדות שנקשרו בו. ידוע שסביבו התקבצה האסכולה הפיתגוראית מעין כת פסבדו-מתמטית שהאמינה ש"הכל מספר", או ליתר דיוק הכל ניתן לכימות, וייחסה למספרים משמעויות מיסטיות. ככל הנראה הפיתגוראים ידעו לבנות את הגופים האפלטוניים, הכירו את הממוצע האריתמטי, הממוצע הגאומטרי והממוצע ההרמוני והגיעו להישגים חשובים נוספים. ניתן לומר שהפיתגוראים גילו את היותו של השורש הריבועי של 2, שהוא גם האלכסון בריבוע שאורך צלעותיו 1, אי רציונלי, אך תגליתם הייתה למעשה רק שהקטעים "חסרי מידה משותפת", ומושג המספר האי רציונלי מאוחר יותר.[2] אזכור ראשון לקיומם של קטעים חסרי מידה משותפת מופיע בדיאלוג "תאיטיטוס" של אפלטון, אך רעיון זה היה מוכר עוד קודם לכן, במאה החמישית לפנה"ס להיפאסוס, בן האסכולה הפיתגוראית, ואולי לפיתגורס עצמו.[3]`,
|
||||
status: EscapeStatus{},
|
||||
},
|
||||
{
|
||||
name: "Mixed RTL+LTR",
|
||||
text: `Many computer programs fail to display bidirectional text correctly.
|
||||
For example, the Hebrew name Sarah (שרה) is spelled: sin (ש) (which appears rightmost),
|
||||
then resh (ר), and finally heh (ה) (which should appear leftmost).`,
|
||||
result: `Many computer programs fail to display bidirectional text correctly.
|
||||
For example, the Hebrew name Sarah (שרה) is spelled: sin (ש) (which appears rightmost),
|
||||
then resh (ר), and finally heh (ה) (which should appear leftmost).`,
|
||||
status: EscapeStatus{},
|
||||
},
|
||||
{
|
||||
name: "Mixed RTL+LTR+BIDI",
|
||||
text: `Many computer programs fail to display bidirectional text correctly.
|
||||
For example, the Hebrew name Sarah ` + "\u2067" + `שרה` + "\u2066\n" +
|
||||
`sin (ש) (which appears rightmost), then resh (ר), and finally heh (ה) (which should appear leftmost).`,
|
||||
result: `Many computer programs fail to display bidirectional text correctly.
|
||||
For example, the Hebrew name Sarah ` + "\u2067" + `שרה` + "\u2066\n" +
|
||||
`sin (ש) (which appears rightmost), then resh (ר), and finally heh (ה) (which should appear leftmost).`,
|
||||
status: EscapeStatus{},
|
||||
},
|
||||
{
|
||||
name: "Accented characters",
|
||||
text: string([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}),
|
||||
result: string([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}),
|
||||
status: EscapeStatus{},
|
||||
},
|
||||
{
|
||||
name: "Program",
|
||||
text: "string([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})",
|
||||
result: "string([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})",
|
||||
status: EscapeStatus{},
|
||||
},
|
||||
{
|
||||
name: "CVE testcase",
|
||||
text: "if access_level != \"user\u202E \u2066// Check if admin\u2069 \u2066\" {",
|
||||
result: `if access_level != "user<span class="escaped-code-point" data-escaped="[U+202E]"><span class="char">` + "\u202e" + `</span></span> <span class="escaped-code-point" data-escaped="[U+2066]"><span class="char">` + "\u2066" + `</span></span>// Check if admin<span class="escaped-code-point" data-escaped="[U+2069]"><span class="char">` + "\u2069" + `</span></span> <span class="escaped-code-point" data-escaped="[U+2066]"><span class="char">` + "\u2066" + `</span></span>" {`,
|
||||
status: EscapeStatus{Escaped: true, HasInvisible: true},
|
||||
},
|
||||
{
|
||||
name: "Mixed testcase with fail",
|
||||
text: `Many computer programs fail to display bidirectional text correctly.
|
||||
For example, the Hebrew name Sarah ` + "\u2067" + `שרה` + "\u2066\n" +
|
||||
`sin (ש) (which appears rightmost), then resh (ר), and finally heh (ה) (which should appear leftmost).` +
|
||||
"\nif access_level != \"user\u202E \u2066// Check if admin\u2069 \u2066\" {\n",
|
||||
result: `Many computer programs fail to display bidirectional text correctly.
|
||||
For example, the Hebrew name Sarah ` + "\u2067" + `שרה` + "\u2066\n" +
|
||||
`sin (ש) (which appears rightmost), then resh (ר), and finally heh (ה) (which should appear leftmost).` +
|
||||
"\n" + `if access_level != "user<span class="escaped-code-point" data-escaped="[U+202E]"><span class="char">` + "\u202e" + `</span></span> <span class="escaped-code-point" data-escaped="[U+2066]"><span class="char">` + "\u2066" + `</span></span>// Check if admin<span class="escaped-code-point" data-escaped="[U+2069]"><span class="char">` + "\u2069" + `</span></span> <span class="escaped-code-point" data-escaped="[U+2066]"><span class="char">` + "\u2066" + `</span></span>" {` + "\n",
|
||||
status: EscapeStatus{Escaped: true, HasInvisible: true},
|
||||
},
|
||||
{
|
||||
// UTF-8/16/32 all use the same codepoint for BOM
|
||||
// Gitea could read UTF-16/32 content and convert into UTF-8 internally then render it, so we only process UTF-8 internally
|
||||
name: "UTF BOM",
|
||||
text: "\xef\xbb\xbftest",
|
||||
result: "\xef\xbb\xbftest",
|
||||
status: EscapeStatus{},
|
||||
},
|
||||
{
|
||||
name: "ambiguous",
|
||||
text: "O𝐾",
|
||||
result: `O<span class="ambiguous-code-point" data-tooltip-content="repo.ambiguous_character:𝐾 [U+1D43E],K [U+004B]"><span class="char">𝐾</span></span>`,
|
||||
status: EscapeStatus{Escaped: true, HasAmbiguous: true},
|
||||
},
|
||||
}
|
||||
|
||||
func TestEscapeControlReader(t *testing.T) {
|
||||
for _, tt := range escapeControlTests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
output := &strings.Builder{}
|
||||
status, err := EscapeControlReader(strings.NewReader(tt.text), output, &translation.MockLocale{})
|
||||
assert.NoError(t, err)
|
||||
assert.Equal(t, tt.status, *status)
|
||||
outStr := output.String()
|
||||
assert.Equal(t, tt.result, outStr)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestSettingAmbiguousUnicodeDetection(t *testing.T) {
|
||||
defer test.MockVariableValue(&setting.UI.AmbiguousUnicodeDetection, true)()
|
||||
_, out := EscapeControlHTML("a test", &translation.MockLocale{})
|
||||
assert.EqualValues(t, `a<span class="escaped-code-point" data-escaped="[U+00A0]"><span class="char"> </span></span>test`, out)
|
||||
setting.UI.AmbiguousUnicodeDetection = false
|
||||
_, out = EscapeControlHTML("a test", &translation.MockLocale{})
|
||||
assert.EqualValues(t, `a test`, out)
|
||||
}
|
||||
|
||||
func TestHTMLChunkReader(t *testing.T) {
|
||||
type textPart struct {
|
||||
text string
|
||||
isTag bool
|
||||
}
|
||||
testReadChunks := func(t *testing.T, chunkSize int, input string, expected []textPart) {
|
||||
r := &htmlChunkReader{in: strings.NewReader(input), readBuf: make([]byte, 0, chunkSize)}
|
||||
var results []textPart
|
||||
for {
|
||||
parts, partIsTag, err := r.readRunes()
|
||||
if err != nil {
|
||||
break
|
||||
}
|
||||
for i, part := range parts {
|
||||
results = append(results, textPart{string(part), partIsTag[i]})
|
||||
}
|
||||
}
|
||||
assert.Equal(t, expected, results, "chunk size: %d, input: %s", chunkSize, input)
|
||||
}
|
||||
|
||||
testReadChunks(t, 10, "abc<def>ghi", []textPart{
|
||||
{text: "abc", isTag: false},
|
||||
{text: "<def>", isTag: true},
|
||||
{text: "gh", isTag: false},
|
||||
// -- chunk
|
||||
{text: "i", isTag: false},
|
||||
})
|
||||
|
||||
testReadChunks(t, 10, "<abc><def>ghi", []textPart{
|
||||
{text: "<abc>", isTag: true},
|
||||
{text: "<def>", isTag: true},
|
||||
// -- chunk
|
||||
{text: "ghi", isTag: false},
|
||||
})
|
||||
|
||||
rune1, rune2, rune3, rune4 := "A", "é", "啊", "🌞"
|
||||
require.Len(t, rune1, 1)
|
||||
require.Len(t, rune2, 2)
|
||||
require.Len(t, rune3, 3)
|
||||
require.Len(t, rune4, 4)
|
||||
input := "<" + rune1 + rune2 + rune3 + rune4 + ">" + rune1 + rune2 + rune3 + rune4
|
||||
testReadChunks(t, 4, input, []textPart{{"<Aé", true}, {"啊", true}, {"🌞", true}, {">", true}, {"Aé", false}, {"啊", false}, {"🌞", false}})
|
||||
testReadChunks(t, 5, input, []textPart{{"<Aé", true}, {"啊", true}, {"🌞>", true}, {"Aé", false}, {"啊", false}, {"🌞", false}})
|
||||
testReadChunks(t, 6, input, []textPart{{"<Aé", true}, {"啊", true}, {"🌞>", true}, {"A", false}, {"é啊", false}, {"🌞", false}})
|
||||
testReadChunks(t, 7, input, []textPart{{"<Aé啊", true}, {"🌞>", true}, {"A", false}, {"é啊", false}, {"🌞", false}})
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
@@ -0,0 +1,201 @@
|
||||
// Copyright 2022 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"go/format"
|
||||
"log"
|
||||
"os"
|
||||
"sort"
|
||||
"text/template"
|
||||
"unicode"
|
||||
|
||||
"gitea.dev/modules/json"
|
||||
|
||||
"golang.org/x/text/unicode/rangetable"
|
||||
)
|
||||
|
||||
// ambiguous.json provides a one to one mapping of ambiguous characters to other characters
|
||||
// See https://github.com/hediet/vscode-unicode-data/blob/main/out/ambiguous.json
|
||||
|
||||
// AmbiguousTable holds the confusable runes of one locale as handed to the
// ambiguous template: Confusable[i] can be mistaken for With[i], and
// RangeTable is built from the runes in Confusable.
type AmbiguousTable struct {
	Confusable, With []rune
	Locale           string
	RangeTable       *unicode.RangeTable
}
|
||||
|
||||
// RunePair couples a confusable rune with the rune it can be mistaken for.
// generateAmbiguous uses it to sort both runes together by Confusable.
type RunePair struct {
	Confusable, With rune
}
|
||||
|
||||
// InvisibleRunes lists the runes that VS Code treats as invisible.
|
||||
// See https://github.com/hediet/vscode-unicode-data
|
||||
var InvisibleRunes = []rune{
|
||||
9, 10, 11, 12, 13, 32, 127, 160, 173, 847, 1564, 4447, 4448, 6068, 6069, 6155, 6156, 6157, 6158, 7355, 7356, 8192, 8193, 8194, 8195, 8196, 8197, 8198, 8199, 8200, 8201, 8202, 8203, 8204, 8205, 8206, 8207, 8234, 8235, 8236, 8237, 8238, 8239, 8287, 8288, 8289, 8290, 8291, 8292, 8293, 8294, 8295, 8296, 8297, 8298, 8299, 8300, 8301, 8302, 8303, 10240, 12288, 12644, 65024, 65025, 65026, 65027, 65028, 65029, 65030, 65031, 65032, 65033, 65034, 65035, 65036, 65037, 65038, 65039, 65279, 65440, 65520, 65521, 65522, 65523, 65524, 65525, 65526, 65527, 65528, 65532, 78844, 119155, 119156, 119157, 119158, 119159, 119160, 119161, 119162, 917504, 917505, 917506, 917507, 917508, 917509, 917510, 917511, 917512, 917513, 917514, 917515, 917516, 917517, 917518, 917519, 917520, 917521, 917522, 917523, 917524, 917525, 917526, 917527, 917528, 917529, 917530, 917531, 917532, 917533, 917534, 917535, 917536, 917537, 917538, 917539, 917540, 917541, 917542, 917543, 917544, 917545, 917546, 917547, 917548, 917549, 917550, 917551, 917552, 917553, 917554, 917555, 917556, 917557, 917558, 917559, 917560, 917561, 917562, 917563, 917564, 917565, 917566, 917567, 917568, 917569, 917570, 917571, 917572, 917573, 917574, 917575, 917576, 917577, 917578, 917579, 917580, 917581, 917582, 917583, 917584, 917585, 917586, 917587, 917588, 917589, 917590, 917591, 917592, 917593, 917594, 917595, 917596, 917597, 917598, 917599, 917600, 917601, 917602, 917603, 917604, 917605, 917606, 917607, 917608, 917609, 917610, 917611, 917612, 917613, 917614, 917615, 917616, 917617, 917618, 917619, 917620, 917621, 917622, 917623, 917624, 917625, 917626, 917627, 917628, 917629, 917630, 917631, 917760, 917761, 917762, 917763, 917764, 917765, 917766, 917767, 917768, 917769, 917770, 917771, 917772, 917773, 917774, 917775, 917776, 917777, 917778, 917779, 917780, 917781, 917782, 917783, 917784, 917785, 917786, 917787, 917788, 917789, 917790, 917791, 917792, 917793, 917794, 917795, 917796, 917797, 917798, 917799, 917800, 917801, 917802, 
917803, 917804, 917805, 917806, 917807, 917808, 917809, 917810, 917811, 917812, 917813, 917814, 917815, 917816, 917817, 917818, 917819, 917820, 917821, 917822, 917823, 917824, 917825, 917826, 917827, 917828, 917829, 917830, 917831, 917832, 917833, 917834, 917835, 917836, 917837, 917838, 917839, 917840, 917841, 917842, 917843, 917844, 917845, 917846, 917847, 917848, 917849, 917850, 917851, 917852, 917853, 917854, 917855, 917856, 917857, 917858, 917859, 917860, 917861, 917862, 917863, 917864, 917865, 917866, 917867, 917868, 917869, 917870, 917871, 917872, 917873, 917874, 917875, 917876, 917877, 917878, 917879, 917880, 917881, 917882, 917883, 917884, 917885, 917886, 917887, 917888, 917889, 917890, 917891, 917892, 917893, 917894, 917895, 917896, 917897, 917898, 917899, 917900, 917901, 917902, 917903, 917904, 917905, 917906, 917907, 917908, 917909, 917910, 917911, 917912, 917913, 917914, 917915, 917916, 917917, 917918, 917919, 917920, 917921, 917922, 917923, 917924, 917925, 917926, 917927, 917928, 917929, 917930, 917931, 917932, 917933, 917934, 917935, 917936, 917937, 917938, 917939, 917940, 917941, 917942, 917943, 917944, 917945, 917946, 917947, 917948, 917949, 917950, 917951, 917952, 917953, 917954, 917955, 917956, 917957, 917958, 917959, 917960, 917961, 917962, 917963, 917964, 917965, 917966, 917967, 917968, 917969, 917970, 917971, 917972, 917973, 917974, 917975, 917976, 917977, 917978, 917979, 917980, 917981, 917982, 917983, 917984, 917985, 917986, 917987, 917988, 917989, 917990, 917991, 917992, 917993, 917994, 917995, 917996, 917997, 917998, 917999,
|
||||
}
|
||||
|
||||
func generateAmbiguous() {
|
||||
bs, err := os.ReadFile("ambiguous.json")
|
||||
if err != nil {
|
||||
log.Fatalf("Unable to read, err: %v", err)
|
||||
}
|
||||
|
||||
var unwrapped string
|
||||
if err := json.Unmarshal(bs, &unwrapped); err != nil {
|
||||
log.Fatalf("Unable to unwrap content in, err: %v", err)
|
||||
}
|
||||
|
||||
fromJSON := map[string][]uint32{}
|
||||
if err := json.Unmarshal([]byte(unwrapped), &fromJSON); err != nil {
|
||||
log.Fatalf("Unable to unmarshal content in, err: %v", err)
|
||||
}
|
||||
|
||||
tables := make([]*AmbiguousTable, 0, len(fromJSON))
|
||||
for locale, chars := range fromJSON {
|
||||
table := &AmbiguousTable{Locale: locale}
|
||||
table.Confusable = make([]rune, 0, len(chars)/2)
|
||||
table.With = make([]rune, 0, len(chars)/2)
|
||||
pairs := make([]RunePair, len(chars)/2)
|
||||
for i := 0; i < len(chars); i += 2 {
|
||||
pairs[i/2].Confusable, pairs[i/2].With = rune(chars[i]), rune(chars[i+1])
|
||||
}
|
||||
sort.Slice(pairs, func(i, j int) bool {
|
||||
return pairs[i].Confusable < pairs[j].Confusable
|
||||
})
|
||||
for _, pair := range pairs {
|
||||
table.Confusable = append(table.Confusable, pair.Confusable)
|
||||
table.With = append(table.With, pair.With)
|
||||
}
|
||||
table.RangeTable = rangetable.New(table.Confusable...)
|
||||
tables = append(tables, table)
|
||||
}
|
||||
sort.Slice(tables, func(i, j int) bool {
|
||||
return tables[i].Locale < tables[j].Locale
|
||||
})
|
||||
data := map[string]any{"Tables": tables}
|
||||
|
||||
if err := runTemplate(templateAmbiguous, "../ambiguous_gen.go", &data); err != nil {
|
||||
log.Fatalf("Unable to run template: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func generateInvisible() {
|
||||
// First we filter the runes to remove
|
||||
// <space><tab><newline>
|
||||
filtered := make([]rune, 0, len(InvisibleRunes))
|
||||
for _, r := range InvisibleRunes {
|
||||
if r == ' ' || r == '\t' || r == '\n' {
|
||||
continue
|
||||
}
|
||||
filtered = append(filtered, r)
|
||||
}
|
||||
|
||||
table := rangetable.New(filtered...)
|
||||
if err := runTemplate(generatorInvisible, "../invisible_gen.go", table); err != nil {
|
||||
log.Fatalf("Unable to run template: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// runTemplate executes t with data, formats the result as Go source and stores
// it in filename.
//
// The file is left untouched when it already holds exactly the formatted
// output. If the rendered text is not valid Go, it is logged and an error is
// returned without writing anything. Errors from executing the template,
// formatting, reading the existing file or writing the new one are returned
// wrapped.
func runTemplate(t *template.Template, filename string, data any) error {
	var buf bytes.Buffer
	if err := t.Execute(&buf, data); err != nil {
		return fmt.Errorf("unable to execute template: %w", err)
	}
	bs, err := format.Source(buf.Bytes())
	if err != nil {
		// dump the unformatted text, the formatter error alone is hard to act on
		log.Printf("Bad source:\n%s", buf.String())
		return fmt.Errorf("unable to format source: %w", err)
	}

	old, err := os.ReadFile(filename)
	switch {
	case err == nil:
		if bytes.Equal(bs, old) {
			// files are the same, don't rewrite it
			return nil
		}
	case !os.IsNotExist(err):
		return fmt.Errorf("failed to read old file %s because %w", filename, err)
	}

	// os.WriteFile creates or truncates the file like os.Create did, and also
	// reports a failing Close instead of silently dropping it.
	if err := os.WriteFile(filename, bs, 0o666); err != nil {
		return fmt.Errorf("unable to write generated source: %w", err)
	}
	return nil
}
|
||||
|
||||
func main() {
|
||||
generateAmbiguous()
|
||||
generateInvisible()
|
||||
}
|
||||
|
||||
// templateAmbiguous is the template that generateAmbiguous renders into
// ../ambiguous_gen.go. Its output declares the AmbiguousTable type and
// newAmbiguousTableMap, which gets one map entry per element of .Tables,
// keyed by the table's locale.
var templateAmbiguous = template.Must(template.New("ambiguousTemplate").Parse(`// This file is generated by modules/charset/generate/generate.go DO NOT EDIT
|
||||
// Copyright 2026 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package charset
|
||||
|
||||
import "unicode"
|
||||
|
||||
// This file is generated from https://github.com/hediet/vscode-unicode-data/blob/main/out/ambiguous.json
|
||||
|
||||
// AmbiguousTable matches a confusable rune with its partner for the Locale
|
||||
type AmbiguousTable struct {
|
||||
Confusable []rune
|
||||
With []rune
|
||||
Locale string
|
||||
RangeTable *unicode.RangeTable
|
||||
}
|
||||
|
||||
func newAmbiguousTableMap() map[string]*AmbiguousTable {
|
||||
return map[string]*AmbiguousTable {
|
||||
{{- range .Tables}}
|
||||
{{printf "%q" .Locale}}: {
|
||||
Confusable: []rune{ {{range .Confusable}}{{.}},{{end}} },
|
||||
With: []rune{ {{range .With}}{{.}},{{end}} },
|
||||
Locale: {{printf "%q" .Locale}},
|
||||
RangeTable: &unicode.RangeTable{
|
||||
R16: []unicode.Range16{
|
||||
{{range .RangeTable.R16 }} {Lo:{{.Lo}}, Hi:{{.Hi}}, Stride: {{.Stride}}},
|
||||
{{end}} },
|
||||
R32: []unicode.Range32{
|
||||
{{range .RangeTable.R32}} {Lo:{{.Lo}}, Hi:{{.Hi}}, Stride: {{.Stride}}},
|
||||
{{end}} },
|
||||
LatinOffset: {{.RangeTable.LatinOffset}},
|
||||
},
|
||||
},
|
||||
{{end}}
|
||||
}
|
||||
}
|
||||
`))
|
||||
|
||||
// generatorInvisible is the template that generateInvisible renders into
// ../invisible_gen.go. It is executed with a *unicode.RangeTable and emits
// newInvisibleRangeTable, which returns that table as a Go literal.
var generatorInvisible = template.Must(template.New("invisibleTemplate").Parse(`// This file is generated by modules/charset/generate/generate.go DO NOT EDIT
|
||||
// Copyright 2026 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package charset
|
||||
|
||||
import "unicode"
|
||||
|
||||
func newInvisibleRangeTable() *unicode.RangeTable {
|
||||
return &unicode.RangeTable{
|
||||
R16: []unicode.Range16{
|
||||
{{range .R16 }} {Lo:{{.Lo}}, Hi:{{.Hi}}, Stride: {{.Stride}}},
|
||||
{{end}}},
|
||||
R32: []unicode.Range32{
|
||||
{{range .R32}} {Lo:{{.Lo}}, Hi:{{.Hi}}, Stride: {{.Stride}}},
|
||||
{{end}}},
|
||||
LatinOffset: {{.LatinOffset}},
|
||||
}
|
||||
}
|
||||
`))
|
||||
@@ -0,0 +1,38 @@
|
||||
// This file is generated by modules/charset/generate/generate.go DO NOT EDIT
|
||||
// Copyright 2026 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package charset
|
||||
|
||||
import "unicode"
|
||||
|
||||
// newInvisibleRangeTable returns a new range table of the runes treated as invisible.
// NOTE(review): this file is generated by modules/charset/generate/generate.go;
// lasting changes belong in the generator, not here.
func newInvisibleRangeTable() *unicode.RangeTable {
|
||||
return &unicode.RangeTable{
|
||||
R16: []unicode.Range16{
|
||||
{Lo: 11, Hi: 13, Stride: 1},
|
||||
{Lo: 127, Hi: 160, Stride: 33},
|
||||
{Lo: 173, Hi: 847, Stride: 674},
|
||||
{Lo: 1564, Hi: 4447, Stride: 2883},
|
||||
{Lo: 4448, Hi: 6068, Stride: 1620},
|
||||
{Lo: 6069, Hi: 6155, Stride: 86},
|
||||
{Lo: 6156, Hi: 6158, Stride: 1},
|
||||
{Lo: 7355, Hi: 7356, Stride: 1},
|
||||
{Lo: 8192, Hi: 8207, Stride: 1},
|
||||
{Lo: 8234, Hi: 8239, Stride: 1},
|
||||
{Lo: 8287, Hi: 8303, Stride: 1},
|
||||
{Lo: 10240, Hi: 12288, Stride: 2048},
|
||||
{Lo: 12644, Hi: 65024, Stride: 52380},
|
||||
{Lo: 65025, Hi: 65039, Stride: 1},
|
||||
{Lo: 65279, Hi: 65440, Stride: 161},
|
||||
{Lo: 65520, Hi: 65528, Stride: 1},
|
||||
{Lo: 65532, Hi: 65532, Stride: 1},
|
||||
},
|
||||
R32: []unicode.Range32{
|
||||
{Lo: 78844, Hi: 119155, Stride: 40311},
|
||||
{Lo: 119156, Hi: 119162, Stride: 1},
|
||||
{Lo: 917504, Hi: 917631, Stride: 1},
|
||||
{Lo: 917760, Hi: 917999, Stride: 1},
|
||||
},
|
||||
LatinOffset: 2,
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user