初始提交: Gitea 项目代码
This commit is contained in:
@@ -0,0 +1,163 @@
|
||||
// Copyright 2015 The Gogs Authors. All rights reserved.
|
||||
// Copyright 2020 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package highlight
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
gohtml "html"
|
||||
"html/template"
|
||||
"sync"
|
||||
|
||||
"gitea.dev/modules/log"
|
||||
"gitea.dev/modules/setting"
|
||||
"gitea.dev/modules/util"
|
||||
|
||||
"github.com/alecthomas/chroma/v2"
|
||||
chromahtml "github.com/alecthomas/chroma/v2/formatters/html"
|
||||
"github.com/alecthomas/chroma/v2/styles"
|
||||
)
|
||||
|
||||
// sizeLimit is the largest input, in bytes, that still gets highlighted.
// Bigger content is rendered without highlighting for performance reasons.
const sizeLimit = 1 << 20
|
||||
|
||||
type globalVarsType struct {
|
||||
highlightMapping map[string]string
|
||||
githubStyles *chroma.Style
|
||||
}
|
||||
|
||||
var (
|
||||
globalVarsMu sync.Mutex
|
||||
globalVarsPtr *globalVarsType
|
||||
)
|
||||
|
||||
func globalVars() *globalVarsType {
|
||||
// in the future, the globalVars might need to be re-initialized when settings change, so don't use sync.Once here
|
||||
globalVarsMu.Lock()
|
||||
defer globalVarsMu.Unlock()
|
||||
if globalVarsPtr == nil {
|
||||
globalVarsPtr = &globalVarsType{}
|
||||
globalVarsPtr.githubStyles = styles.Get("github")
|
||||
globalVarsPtr.highlightMapping = setting.GetHighlightMapping()
|
||||
}
|
||||
return globalVarsPtr
|
||||
}
|
||||
|
||||
// UnsafeSplitHighlightedLines splits highlighted code into lines preserving HTML tags
|
||||
// It always includes '\n', '\n' can appear at the end of each line or in the middle of HTML tags
|
||||
// The '\n' is necessary for copying code from web UI to preserve original code lines
|
||||
// ATTENTION: It uses the unsafe conversion between string and []byte for performance reason
|
||||
// DO NOT make any modification to the returned [][]byte slice items
|
||||
func UnsafeSplitHighlightedLines(code template.HTML) (ret [][]byte) {
|
||||
buf := util.UnsafeStringToBytes(string(code))
|
||||
lineCount := bytes.Count(buf, []byte("\n")) + 1
|
||||
ret = make([][]byte, 0, lineCount)
|
||||
nlTagClose := []byte("\n</")
|
||||
for {
|
||||
pos := bytes.IndexByte(buf, '\n')
|
||||
if pos == -1 {
|
||||
if len(buf) > 0 {
|
||||
ret = append(ret, buf)
|
||||
}
|
||||
return ret
|
||||
}
|
||||
// Chroma highlighting output sometimes have "</span>" right after \n, sometimes before.
|
||||
// * "<span>text\n</span>"
|
||||
// * "<span>text</span>\n"
|
||||
if bytes.HasPrefix(buf[pos:], nlTagClose) {
|
||||
pos1 := bytes.IndexByte(buf[pos:], '>')
|
||||
if pos1 != -1 {
|
||||
pos += pos1
|
||||
}
|
||||
}
|
||||
ret = append(ret, buf[:pos+1])
|
||||
buf = buf[pos+1:]
|
||||
}
|
||||
}
|
||||
|
||||
// htmlEscape escapes code with the standard library HTML escaper and marks the result as safe HTML.
func htmlEscape(code string) template.HTML {
	escaped := gohtml.EscapeString(code)
	return template.HTML(escaped)
}
|
||||
|
||||
// RenderCodeSlowGuess tries to get a lexer by file name and language first,
|
||||
// if not found, it will try to guess the lexer by code content, which is slow (more than several hundreds of milliseconds).
|
||||
func RenderCodeSlowGuess(fileName, language, code string) (output template.HTML, lexer chroma.Lexer, lexerDisplayName string) {
|
||||
// diff view newline will be passed as empty, change to literal '\n' so it can be copied
|
||||
// preserve literal newline in blame view
|
||||
if code == "" || code == "\n" {
|
||||
return "\n", nil, ""
|
||||
}
|
||||
|
||||
if len(code) > sizeLimit {
|
||||
return htmlEscape(code), nil, ""
|
||||
}
|
||||
|
||||
lexer = detectChromaLexerWithAnalyze(fileName, language, util.UnsafeStringToBytes(code)) // it is also slow
|
||||
return RenderCodeByLexer(lexer, code), lexer, formatLexerName(lexer.Config().Name)
|
||||
}
|
||||
|
||||
// RenderCodeByLexer returns a HTML version of code string with chroma syntax highlighting classes
|
||||
func RenderCodeByLexer(lexer chroma.Lexer, code string) template.HTML {
|
||||
formatter := chromahtml.New(chromahtml.WithClasses(true),
|
||||
chromahtml.WithLineNumbers(false),
|
||||
chromahtml.PreventSurroundingPre(true),
|
||||
)
|
||||
|
||||
iterator, err := lexer.Tokenise(nil, code)
|
||||
if err != nil {
|
||||
log.Error("Can't tokenize code: %v", err)
|
||||
return htmlEscape(code)
|
||||
}
|
||||
|
||||
htmlBuf := &bytes.Buffer{}
|
||||
// style not used for live site but need to pass something
|
||||
err = formatter.Format(htmlBuf, globalVars().githubStyles, iterator)
|
||||
if err != nil {
|
||||
log.Error("Can't format code: %v", err)
|
||||
return htmlEscape(code)
|
||||
}
|
||||
return template.HTML(util.UnsafeBytesToString(htmlBuf.Bytes()))
|
||||
}
|
||||
|
||||
// RenderFullFile returns a slice of chroma syntax highlighted HTML lines of code and the matched lexer name
|
||||
func RenderFullFile(fileName, language string, code []byte) ([]template.HTML, string) {
|
||||
if language == LanguagePlaintext || len(code) > sizeLimit {
|
||||
return renderPlainText(code), formatLexerName(LanguagePlaintext)
|
||||
}
|
||||
lexer := detectChromaLexerWithAnalyze(fileName, language, code)
|
||||
lexerName := formatLexerName(lexer.Config().Name)
|
||||
rendered := RenderCodeByLexer(lexer, util.UnsafeBytesToString(code))
|
||||
unsafeLines := UnsafeSplitHighlightedLines(rendered)
|
||||
lines := make([]template.HTML, len(unsafeLines))
|
||||
for idx, lineBytes := range unsafeLines {
|
||||
lines[idx] = template.HTML(util.UnsafeBytesToString(lineBytes))
|
||||
}
|
||||
return lines, lexerName
|
||||
}
|
||||
|
||||
// renderPlainText returns non-highlighted HTML for code
|
||||
func renderPlainText(code []byte) []template.HTML {
|
||||
lines := make([]template.HTML, 0, bytes.Count(code, []byte{'\n'})+1)
|
||||
pos := 0
|
||||
for pos < len(code) {
|
||||
var content []byte
|
||||
nextPos := bytes.IndexByte(code[pos:], '\n')
|
||||
if nextPos == -1 {
|
||||
content = code[pos:]
|
||||
pos = len(code)
|
||||
} else {
|
||||
content = code[pos : pos+nextPos+1]
|
||||
pos += nextPos + 1
|
||||
}
|
||||
lines = append(lines, htmlEscape(util.UnsafeBytesToString(content)))
|
||||
}
|
||||
return lines
|
||||
}
|
||||
|
||||
func formatLexerName(name string) string {
|
||||
if name == LanguagePlaintext || name == chromaLexerFallback {
|
||||
return "Plaintext"
|
||||
}
|
||||
return util.ToTitleCaseNoLower(name)
|
||||
}
|
||||
@@ -0,0 +1,218 @@
|
||||
// Copyright 2021 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package highlight
|
||||
|
||||
import (
|
||||
"html/template"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
// lines builds the expected line slice for the tests from a compact notation:
// surrounding whitespace and real newlines in s are dropped, then every literal `\n` becomes a real newline.
// "" => [], "a" => ["a"], "a\n" => ["a\n"], "a\nb" => ["a\n", "b"] (each line always includes EOL "\n" if it exists)
// The result is never nil, so it compares equal to an empty non-nil slice.
func lines(s string) (out []template.HTML) {
	out = []template.HTML{}
	trimmed := strings.TrimSpace(s)
	joined := strings.ReplaceAll(trimmed, "\n", "")
	rest := strings.ReplaceAll(joined, `\n`, "\n")
	for rest != "" {
		nl := strings.IndexByte(rest, '\n')
		if nl == -1 {
			out = append(out, template.HTML(rest))
			break
		}
		out = append(out, template.HTML(rest[:nl+1]))
		rest = rest[nl+1:]
	}
	return out
}
|
||||
|
||||
func TestFile(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
code string
|
||||
want []template.HTML
|
||||
lexerName string
|
||||
}{
|
||||
{
|
||||
name: "empty.py",
|
||||
code: "",
|
||||
want: lines(""),
|
||||
lexerName: "Python",
|
||||
},
|
||||
{
|
||||
name: "empty.js",
|
||||
code: "",
|
||||
want: lines(""),
|
||||
lexerName: "JavaScript",
|
||||
},
|
||||
{
|
||||
name: "empty.yaml",
|
||||
code: "",
|
||||
want: lines(""),
|
||||
lexerName: "YAML",
|
||||
},
|
||||
{
|
||||
name: "tags.txt",
|
||||
code: "<>",
|
||||
want: lines("<>"),
|
||||
lexerName: "Plaintext",
|
||||
},
|
||||
{
|
||||
name: "tags.py",
|
||||
code: "<>",
|
||||
want: lines(`<span class="o"><></span>`),
|
||||
lexerName: "Python",
|
||||
},
|
||||
{
|
||||
name: "eol-no.py",
|
||||
code: "a=1",
|
||||
want: lines(`<span class="n">a</span><span class="o">=</span><span class="mi">1</span>`),
|
||||
lexerName: "Python",
|
||||
},
|
||||
{
|
||||
name: "eol-newline1.py",
|
||||
code: "a=1\n",
|
||||
want: lines(`<span class="n">a</span><span class="o">=</span><span class="mi">1</span>\n`),
|
||||
lexerName: "Python",
|
||||
},
|
||||
{
|
||||
name: "eol-newline2.py",
|
||||
code: "a=1\n\n",
|
||||
want: lines(`
|
||||
<span class="n">a</span><span class="o">=</span><span class="mi">1</span>\n
|
||||
\n
|
||||
`,
|
||||
),
|
||||
lexerName: "Python",
|
||||
},
|
||||
{
|
||||
name: "empty-line-with-space.py",
|
||||
code: strings.ReplaceAll(strings.TrimSpace(`
|
||||
def:
|
||||
a=1
|
||||
|
||||
b=''
|
||||
{space}
|
||||
c=2
|
||||
`), "{space}", " "),
|
||||
want: lines(`
|
||||
<span class="n">def</span><span class="p">:</span>\n
|
||||
<span class="n">a</span><span class="o">=</span><span class="mi">1</span>\n
|
||||
\n
|
||||
<span class="n">b</span><span class="o">=</span><span class="s1">''</span>\n
|
||||
\n
|
||||
<span class="n">c</span><span class="o">=</span><span class="mi">2</span>`,
|
||||
),
|
||||
lexerName: "Python",
|
||||
},
|
||||
{
|
||||
name: "test.sql",
|
||||
code: "--\nSELECT",
|
||||
want: []template.HTML{"<span class=\"c1\">--\n</span>", `<span class="k">SELECT</span>`},
|
||||
lexerName: "SQL",
|
||||
},
|
||||
{
|
||||
name: "test.http",
|
||||
code: `HTTP/1.0 400 Bad request
|
||||
Content-Type: text/html
|
||||
|
||||
<html></html>`,
|
||||
want: lines(`<span class="kr">HTTP</span><span class="o">/</span><span class="m">1.0</span> <span class="m">400</span> <span class="ne">Bad request</span>\n
|
||||
<span class="n">Content-Type</span><span class="o">:</span> <span class="l">text/html</span>\n
|
||||
\n
|
||||
<span class="p"><</span><span class="nt">html</span><span class="p">></</span><span class="nt">html</span><span class="p">></span>`),
|
||||
lexerName: "HTTP",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
out, lexerName := RenderFullFile(tt.name, "", []byte(tt.code))
|
||||
assert.Equal(t, tt.want, out)
|
||||
assert.Equal(t, tt.lexerName, lexerName)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestPlainText(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
code string
|
||||
want []template.HTML
|
||||
}{
|
||||
{
|
||||
name: "empty.py",
|
||||
code: "",
|
||||
want: lines(""),
|
||||
},
|
||||
{
|
||||
name: "tags.py",
|
||||
code: "<>",
|
||||
want: lines("<>"),
|
||||
},
|
||||
{
|
||||
name: "eol-no.py",
|
||||
code: "a=1",
|
||||
want: lines(`a=1`),
|
||||
},
|
||||
{
|
||||
name: "eol-newline1.py",
|
||||
code: "a=1\n",
|
||||
want: lines(`a=1\n`),
|
||||
},
|
||||
{
|
||||
name: "eol-newline2.py",
|
||||
code: "a=1\n\n",
|
||||
want: lines(`
|
||||
a=1\n
|
||||
\n
|
||||
`),
|
||||
},
|
||||
{
|
||||
name: "empty-line-with-space.py",
|
||||
code: strings.ReplaceAll(strings.TrimSpace(`
|
||||
def:
|
||||
a=1
|
||||
|
||||
b=''
|
||||
{space}
|
||||
c=2
|
||||
`), "{space}", " "),
|
||||
want: lines(`
|
||||
def:\n
|
||||
a=1\n
|
||||
\n
|
||||
b=''\n
|
||||
\n
|
||||
c=2`),
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
out := renderPlainText([]byte(tt.code))
|
||||
assert.Equal(t, tt.want, out)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestUnsafeSplitHighlightedLines(t *testing.T) {
|
||||
ret := UnsafeSplitHighlightedLines("")
|
||||
assert.Empty(t, ret)
|
||||
|
||||
ret = UnsafeSplitHighlightedLines("a")
|
||||
assert.Len(t, ret, 1)
|
||||
assert.Equal(t, "a", string(ret[0]))
|
||||
|
||||
ret = UnsafeSplitHighlightedLines("\n")
|
||||
assert.Len(t, ret, 1)
|
||||
assert.Equal(t, "\n", string(ret[0]))
|
||||
|
||||
ret = UnsafeSplitHighlightedLines("<span>a</span>\n<span>b\n</span>")
|
||||
assert.Len(t, ret, 2)
|
||||
assert.Equal(t, "<span>a</span>\n", string(ret[0]))
|
||||
assert.Equal(t, "<span>b\n</span>", string(ret[1]))
|
||||
}
|
||||
@@ -0,0 +1,312 @@
|
||||
// Copyright 2026 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package highlight
|
||||
|
||||
import (
|
||||
"path"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"gitea.dev/modules/analyze"
|
||||
"gitea.dev/modules/log"
|
||||
|
||||
"github.com/alecthomas/chroma/v2"
|
||||
"github.com/alecthomas/chroma/v2/lexers"
|
||||
"github.com/go-enry/go-enry/v2"
|
||||
)
|
||||
|
||||
const (
|
||||
mapKeyLowerPrefix = "lower/"
|
||||
LanguagePlaintext = "plaintext"
|
||||
chromaLexerFallback = "fallback"
|
||||
)
|
||||
|
||||
// chromaLexers is fully managed by us to do fast lookup for chroma lexers by file name or language name
|
||||
// Don't use lexers.Get because it is very slow in many cases (iterate all rules, filepath glob match, etc.)
|
||||
var chromaLexers = sync.OnceValue(func() (ret struct {
|
||||
conflictingExtLangMap map[string]string
|
||||
conflictingAliasLangMap map[string]string
|
||||
|
||||
lowerNameMap map[string]chroma.Lexer // lexer name (lang name) in lower-case
|
||||
fileBaseMap map[string]chroma.Lexer
|
||||
fileExtMap map[string]chroma.Lexer
|
||||
fileParts []struct {
|
||||
part string
|
||||
lexer chroma.Lexer
|
||||
}
|
||||
},
|
||||
) {
|
||||
ret.lowerNameMap = make(map[string]chroma.Lexer)
|
||||
ret.fileBaseMap = make(map[string]chroma.Lexer)
|
||||
ret.fileExtMap = make(map[string]chroma.Lexer)
|
||||
|
||||
// Chroma has conflicts in file extension for different languages,
|
||||
// When we need to do fast render, there is no way to detect the language by content,
|
||||
// So we can only choose some default languages for the conflicted file extensions.
|
||||
ret.conflictingExtLangMap = map[string]string{
|
||||
".as": "ActionScript 3", // ActionScript
|
||||
".asm": "NASM", // TASM, NASM, RGBDS Assembly, Z80 Assembly
|
||||
".ASM": "NASM",
|
||||
".bas": "VB.net", // QBasic
|
||||
".bf": "Beef", // Brainfuck
|
||||
".fs": "FSharp", // Forth
|
||||
".gd": "GDScript", // GDScript3
|
||||
".h": "C", // Objective-C
|
||||
".hcl": "Terraform", // HCL
|
||||
".hh": "C++", // HolyC
|
||||
".inc": "PHP", // ObjectPascal, POVRay, SourcePawn, PHTML
|
||||
".m": "Objective-C", // Matlab, Mathematica, Mason
|
||||
".mc": "Mason", // MonkeyC
|
||||
".mod": "AMPL", // Modula-2
|
||||
".network": "SYSTEMD", // INI
|
||||
".php": "PHP", // PHTML
|
||||
".php3": "PHP", // PHTML
|
||||
".php4": "PHP", // PHTML
|
||||
".php5": "PHP", // PHTML
|
||||
".pl": "Perl", // Prolog, Raku
|
||||
".pm": "Perl", // Promela, Raku
|
||||
".pp": "ObjectPascal", // Puppet
|
||||
".s": "ArmAsm", // GAS
|
||||
".S": "ArmAsm", // R, GAS
|
||||
".service": "SYSTEMD", // INI
|
||||
".socket": "SYSTEMD", // INI
|
||||
".sql": "SQL", // MySQL
|
||||
".t": "Perl", // Raku
|
||||
".ts": "TypeScript", // TypoScript
|
||||
".v": "V", // verilog
|
||||
".xslt": "HTML", // XML
|
||||
}
|
||||
// use widely used language names as the default mapping to resolve name alias conflict
|
||||
ret.conflictingAliasLangMap = map[string]string{
|
||||
"hcl": "HCL", // Terraform
|
||||
"v": "V", // verilog
|
||||
}
|
||||
|
||||
isPlainPattern := func(key string) bool {
|
||||
return !strings.ContainsAny(key, "*?[]") // only support simple patterns
|
||||
}
|
||||
|
||||
setFileNameMapWithLowerKey := func(m map[string]chroma.Lexer, key string, lexer chroma.Lexer) {
|
||||
if _, conflict := m[key]; conflict {
|
||||
panic("duplicate key in lexer map: " + key + ", need to add it to conflictingExtLangMap")
|
||||
}
|
||||
m[key] = lexer
|
||||
m[mapKeyLowerPrefix+strings.ToLower(key)] = lexer
|
||||
}
|
||||
|
||||
processFileName := func(fileName string, lexer chroma.Lexer) bool {
|
||||
if isPlainPattern(fileName) {
|
||||
// full base name match
|
||||
setFileNameMapWithLowerKey(ret.fileBaseMap, fileName, lexer)
|
||||
return true
|
||||
}
|
||||
if strings.HasPrefix(fileName, "*") {
|
||||
// ext name match: "*.js"
|
||||
fileExt := strings.Trim(fileName, "*")
|
||||
if isPlainPattern(fileExt) {
|
||||
presetName := ret.conflictingExtLangMap[fileExt]
|
||||
if presetName == "" || lexer.Config().Name == presetName {
|
||||
setFileNameMapWithLowerKey(ret.fileExtMap, fileExt, lexer)
|
||||
}
|
||||
return true
|
||||
}
|
||||
}
|
||||
if strings.HasSuffix(fileName, "*") {
|
||||
// part match: "*.env.*"
|
||||
filePart := strings.Trim(fileName, "*")
|
||||
if isPlainPattern(filePart) {
|
||||
ret.fileParts = append(ret.fileParts, struct {
|
||||
part string
|
||||
lexer chroma.Lexer
|
||||
}{
|
||||
part: filePart,
|
||||
lexer: lexer,
|
||||
})
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
expandGlobPatterns := func(patterns []string) []string {
|
||||
// expand patterns like "file.[ch]" to "file.c" and "file.h", only one pair of "[]" is supported, enough for current Chroma lexers
|
||||
for idx, s := range patterns {
|
||||
idx1 := strings.IndexByte(s, '[')
|
||||
idx2 := strings.IndexByte(s, ']')
|
||||
if idx1 != -1 && idx2 != -1 && idx2 > idx1+1 {
|
||||
left, mid, right := s[:idx1], s[idx1+1:idx2], s[idx2+1:]
|
||||
patterns[idx] = left + mid[0:1] + right
|
||||
for i := 1; i < len(mid); i++ {
|
||||
patterns = append(patterns, left+mid[i:i+1]+right)
|
||||
}
|
||||
}
|
||||
}
|
||||
return patterns
|
||||
}
|
||||
|
||||
processLexerNameAliases := func(lexer chroma.Lexer) {
|
||||
cfg := lexer.Config()
|
||||
lowerName := strings.ToLower(cfg.Name)
|
||||
if _, conflicted := ret.lowerNameMap[lowerName]; conflicted {
|
||||
panic("duplicate language name in lexer map: " + lowerName)
|
||||
}
|
||||
ret.lowerNameMap[lowerName] = lexer
|
||||
|
||||
for _, name := range cfg.Aliases {
|
||||
lowerName := strings.ToLower(name)
|
||||
if overriddenName, overridden := ret.conflictingAliasLangMap[lowerName]; overridden && overriddenName != cfg.Name {
|
||||
continue
|
||||
}
|
||||
if existingLexer, conflict := ret.lowerNameMap[lowerName]; conflict && existingLexer.Config().Name != cfg.Name {
|
||||
panic("duplicate alias in lexer map: " + name + ", conflict between " + existingLexer.Config().Name + " and " + cfg.Name)
|
||||
}
|
||||
ret.lowerNameMap[lowerName] = lexer
|
||||
}
|
||||
}
|
||||
|
||||
// the main loop: build our lookup maps for lexers
|
||||
for _, lexer := range lexers.GlobalLexerRegistry.Lexers {
|
||||
cfg := lexer.Config()
|
||||
processLexerNameAliases(lexer)
|
||||
for _, s := range expandGlobPatterns(cfg.Filenames) {
|
||||
if !processFileName(s, lexer) {
|
||||
panic("unsupported file name pattern in lexer: " + s)
|
||||
}
|
||||
}
|
||||
for _, s := range expandGlobPatterns(cfg.AliasFilenames) {
|
||||
if !processFileName(s, lexer) {
|
||||
panic("unsupported alias file name pattern in lexer: " + s)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// final check: make sure the default overriding mapping is correct, nothing is missing
|
||||
for lowerName, lexerName := range ret.conflictingAliasLangMap {
|
||||
if lexer, ok := ret.lowerNameMap[lowerName]; !ok || lexer.Config().Name != lexerName {
|
||||
panic("missing default name-lang mapping for: " + lowerName)
|
||||
}
|
||||
}
|
||||
for ext, lexerName := range ret.conflictingExtLangMap {
|
||||
if lexer, ok := ret.fileExtMap[ext]; !ok || lexer.Config().Name != lexerName {
|
||||
panic("missing default ext-lang mapping for: " + ext)
|
||||
}
|
||||
}
|
||||
return ret
|
||||
})
|
||||
|
||||
// normalizeFileNameLang prepares a file name and a language name for the lexer lookup.
// It returns the base name of fileName, and fileLang without its "?..." suffix, translated
// to the lexer naming where the naming differs ("F#", "Pascal", and "C" for ".C"/".H" files).
// An empty fileName stays empty: path.Base would turn it into ".", which then looks like
// a file with the extension ".".
func normalizeFileNameLang(fileName, fileLang string) (string, string) {
	if fileName != "" {
		fileName = path.Base(fileName)
	}
	fileLang, _, _ = strings.Cut(fileLang, "?") // maybe, the value from gitattributes might contain `?` parameters?
	// the "lang" might come from enry or gitattributes, it has different naming for some languages
	switch fileLang {
	case "F#":
		fileLang = "FSharp"
	case "Pascal":
		fileLang = "ObjectPascal"
	case "C":
		// upper-case extensions are mapped to C++
		if ext := path.Ext(fileName); ext == ".C" || ext == ".H" {
			fileLang = "C++"
		}
	}
	return fileName, fileLang
}
|
||||
|
||||
func DetectChromaLexerByFileName(fileName, fileLang string) chroma.Lexer {
|
||||
lexer, _ := detectChromaLexerByFileName(fileName, fileLang)
|
||||
return lexer
|
||||
}
|
||||
|
||||
func detectChromaLexerByFileName(fileName, fileLang string) (_ chroma.Lexer, byLang bool) {
|
||||
fileName, fileLang = normalizeFileNameLang(fileName, fileLang)
|
||||
fileExt := path.Ext(fileName)
|
||||
|
||||
// apply custom mapping for file extension, highest priority, for example:
|
||||
// * ".my-js" -> ".js"
|
||||
// * ".my-html" -> "HTML"
|
||||
if fileExt != "" {
|
||||
if val, ok := globalVars().highlightMapping[fileExt]; ok {
|
||||
if strings.HasPrefix(val, ".") {
|
||||
fileName = "dummy" + val
|
||||
fileLang = ""
|
||||
} else {
|
||||
fileLang = val
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// try to use language for lexer name
|
||||
if fileLang != "" {
|
||||
lexer := chromaLexers().lowerNameMap[strings.ToLower(fileLang)]
|
||||
if lexer != nil {
|
||||
return lexer, true
|
||||
}
|
||||
}
|
||||
|
||||
if fileName == "" {
|
||||
return lexers.Fallback, false
|
||||
}
|
||||
|
||||
// try base name
|
||||
{
|
||||
baseName := path.Base(fileName)
|
||||
if lexer, ok := chromaLexers().fileBaseMap[baseName]; ok {
|
||||
return lexer, false
|
||||
} else if lexer, ok = chromaLexers().fileBaseMap[mapKeyLowerPrefix+strings.ToLower(baseName)]; ok {
|
||||
return lexer, false
|
||||
}
|
||||
}
|
||||
|
||||
if fileExt == "" {
|
||||
return lexers.Fallback, false
|
||||
}
|
||||
|
||||
// try ext name
|
||||
{
|
||||
if lexer, ok := chromaLexers().fileExtMap[fileExt]; ok {
|
||||
return lexer, false
|
||||
} else if lexer, ok = chromaLexers().fileExtMap[mapKeyLowerPrefix+strings.ToLower(fileExt)]; ok {
|
||||
return lexer, false
|
||||
}
|
||||
}
|
||||
|
||||
// try file part match, for example: ".env.local" for "*.env.*"
|
||||
// it assumes that there must be a dot in filename (fileExt isn't empty)
|
||||
for _, item := range chromaLexers().fileParts {
|
||||
if strings.Contains(fileName, item.part) {
|
||||
return item.lexer, false
|
||||
}
|
||||
}
|
||||
return lexers.Fallback, false
|
||||
}
|
||||
|
||||
// detectChromaLexerWithAnalyze returns a chroma lexer by given file name, language and code content. All parameters can be optional.
|
||||
// When code content is provided, it will be slow if no lexer is found by file name or language.
|
||||
// If no lexer is found, it will return the fallback lexer.
|
||||
func detectChromaLexerWithAnalyze(fileName, lang string, code []byte) chroma.Lexer {
|
||||
lexer, byLang := detectChromaLexerByFileName(fileName, lang)
|
||||
|
||||
// if lang is provided, and it matches a lexer, use it directly
|
||||
if byLang {
|
||||
return chroma.Coalesce(lexer)
|
||||
}
|
||||
|
||||
// if a lexer is detected and there is no conflict for the file extension, use it directly
|
||||
fileExt := path.Ext(fileName)
|
||||
_, hasConflicts := chromaLexers().conflictingExtLangMap[fileExt]
|
||||
if !hasConflicts && lexer != lexers.Fallback {
|
||||
return chroma.Coalesce(lexer)
|
||||
}
|
||||
|
||||
// try to detect language by content, for best guessing for the language
|
||||
// when using "code" to detect, analyze.GetCodeLanguage is slow, it iterates many rules to detect language from content
|
||||
analyzedLanguage := analyze.GetCodeLanguage(fileName, code)
|
||||
lexer, _ = detectChromaLexerByFileName(fileName, analyzedLanguage)
|
||||
if lexer == lexers.Fallback {
|
||||
if analyzedLanguage != enry.OtherLanguage {
|
||||
log.Warn("No chroma lexer found for enry detected language: %s (file: %s), need to fix the language mapping between enry and chroma.", analyzedLanguage, fileName)
|
||||
}
|
||||
}
|
||||
return chroma.Coalesce(lexer)
|
||||
}
|
||||
@@ -0,0 +1,116 @@
|
||||
// Copyright 2026 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package highlight
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/alecthomas/chroma/v2/lexers"
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func BenchmarkDetectChromaLexerByFileName(b *testing.B) {
|
||||
for b.Loop() {
|
||||
// BenchmarkDetectChromaLexerByFileName-12 18214717 61.35 ns/op
|
||||
DetectChromaLexerByFileName("a.sql", "")
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkDetectChromaLexerWithAnalyze(b *testing.B) {
|
||||
b.StopTimer()
|
||||
code := []byte(strings.Repeat("SELECT * FROM table;\n", 1000))
|
||||
b.StartTimer()
|
||||
for b.Loop() {
|
||||
// BenchmarkRenderCodeSlowGuess-12 87946 13310 ns/op
|
||||
detectChromaLexerWithAnalyze("a", "", code)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkChromaAnalyze(b *testing.B) {
|
||||
b.StopTimer()
|
||||
code := strings.Repeat("SELECT * FROM table;\n", 1000)
|
||||
b.StartTimer()
|
||||
for b.Loop() {
|
||||
// comparing to detectChromaLexerWithAnalyze (go-enry), "chroma/lexers.Analyse" is very slow
|
||||
// BenchmarkChromaAnalyze-12 519 2247104 ns/op
|
||||
lexers.Analyse(code)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkRenderCodeByLexer(b *testing.B) {
|
||||
b.StopTimer()
|
||||
code := strings.Repeat("SELECT * FROM table;\n", 1000)
|
||||
lexer := DetectChromaLexerByFileName("a.sql", "")
|
||||
b.StartTimer()
|
||||
for b.Loop() {
|
||||
// Really slow ....... the regexp2 used by Chroma takes most of the time
|
||||
// BenchmarkRenderCodeByLexer-12 22 47159038 ns/op
|
||||
RenderCodeByLexer(lexer, code)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDetectChromaLexer(t *testing.T) {
|
||||
globalVars().highlightMapping[".my-html"] = "HTML"
|
||||
t.Cleanup(func() { delete(globalVars().highlightMapping, ".my-html") })
|
||||
|
||||
casesWithContent := []struct {
|
||||
fileName string
|
||||
language string
|
||||
content string
|
||||
expected string
|
||||
}{
|
||||
{"test.v", "", "", "V"},
|
||||
{"test.v", "any-lang-name", "", "V"},
|
||||
|
||||
{"any-file", "javascript", "", "JavaScript"},
|
||||
{"any-file", "", "/* vim: set filetype=python */", "Python"},
|
||||
{"any-file", "", "", "fallback"},
|
||||
|
||||
{"test.fs", "", "", "FSharp"},
|
||||
{"test.fs", "F#", "", "FSharp"},
|
||||
{"test.fs", "", "let x = 1", "FSharp"},
|
||||
|
||||
{"test.c", "", "", "C"},
|
||||
{"test.C", "", "", "C++"},
|
||||
{"OLD-CODE.PAS", "", "", "ObjectPascal"},
|
||||
{"test.my-html", "", "", "HTML"},
|
||||
|
||||
{"a.php", "", "", "PHP"},
|
||||
{"a.sql", "", "", "SQL"},
|
||||
{"dhcpd.conf", "", "", "ISCdhcpd"},
|
||||
{".env.my-production", "", "", "Bash"},
|
||||
|
||||
{"a.hcl", "", "", "HCL"}, // not the same as Chroma, enry detects "*.hcl" as "HCL"
|
||||
{"a.hcl", "HCL", "", "HCL"},
|
||||
{"a.hcl", "Terraform", "", "Terraform"},
|
||||
}
|
||||
for _, c := range casesWithContent {
|
||||
lexer := detectChromaLexerWithAnalyze(c.fileName, c.language, []byte(c.content))
|
||||
if assert.NotNil(t, lexer, "case: %+v", c) {
|
||||
assert.Equal(t, c.expected, lexer.Config().Name, "case: %+v", c)
|
||||
}
|
||||
}
|
||||
|
||||
casesNameLang := []struct {
|
||||
fileName string
|
||||
language string
|
||||
expected string
|
||||
byLang bool
|
||||
}{
|
||||
{"a.v", "", "V", false},
|
||||
{"a.v", "V", "V", true},
|
||||
{"a.v", "verilog", "verilog", true},
|
||||
{"a.v", "any-lang-name", "V", false},
|
||||
|
||||
{"a.hcl", "", "Terraform", false}, // not the same as enry
|
||||
{"a.hcl", "HCL", "HCL", true},
|
||||
{"a.hcl", "Terraform", "Terraform", true},
|
||||
}
|
||||
for _, c := range casesNameLang {
|
||||
lexer, byLang := detectChromaLexerByFileName(c.fileName, c.language)
|
||||
assert.Equal(t, c.expected, lexer.Config().Name, "case: %+v", c)
|
||||
assert.Equal(t, c.byLang, byLang, "case: %+v", c)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user