初始提交: Gitea 项目代码
This commit is contained in:
@@ -0,0 +1,189 @@
|
||||
// Copyright 2021 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package typesniffer
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"net/http"
|
||||
"regexp"
|
||||
"slices"
|
||||
"strings"
|
||||
"sync"
|
||||
)
|
||||
|
||||
// SniffContentSize is the number of leading bytes DetectContentType keeps for its own
// additional checks; longer input is truncated to this size before those checks run.
const SniffContentSize = 1024
|
||||
|
||||
// MIME type names used by this package.
const (
	// MimeTypeImageSvg is the MIME type for SVG images.
	MimeTypeImageSvg = "image/svg+xml"
	// MimeTypeImageAvif is the MIME type for AVIF images.
	MimeTypeImageAvif = "image/avif"

	// MimeTypeApplicationOctetStream is the generic MIME type for arbitrary binary data.
	MimeTypeApplicationOctetStream = "application/octet-stream"
)
|
||||
|
||||
// globalVars returns the regular expressions used for SVG detection.
// They are compiled once, on the first call, and the same value is returned afterwards.
var globalVars = sync.OnceValue(func() (ret struct {
	// svgComment matches one XML/HTML comment (possibly multi-line).
	// svgTagRegex matches content that starts with optional SVG DOCTYPE declarations followed by "<svg".
	// svgTagInXMLRegex is like svgTagRegex but requires a leading "<?xml ... ?>" declaration.
	svgComment, svgTagRegex, svgTagInXMLRegex *regexp.Regexp
},
) {
	ret.svgComment = regexp.MustCompile(`(?s)<!--.*?-->`)
	ret.svgTagRegex = regexp.MustCompile(`(?si)\A\s*(?:(<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg\b`)
	ret.svgTagInXMLRegex = regexp.MustCompile(`(?si)\A<\?xml\b.*?\?>\s*(?:(<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg\b`)
	return ret
})
|
||||
|
||||
// SniffedType contains information about a blob's type.
|
||||
type SniffedType struct {
|
||||
contentType string
|
||||
}
|
||||
|
||||
// IsText detects if the content format is text family, including text/plain, text/html, text/css, etc.
|
||||
func (ct SniffedType) IsText() bool {
|
||||
return strings.Contains(ct.contentType, "text/")
|
||||
}
|
||||
|
||||
func (ct SniffedType) IsTextPlain() bool {
|
||||
return strings.Contains(ct.contentType, "text/plain")
|
||||
}
|
||||
|
||||
// IsImage detects if data is an image format
|
||||
func (ct SniffedType) IsImage() bool {
|
||||
return strings.Contains(ct.contentType, "image/")
|
||||
}
|
||||
|
||||
// IsSvgImage detects if data is an SVG image format
|
||||
func (ct SniffedType) IsSvgImage() bool {
|
||||
return strings.Contains(ct.contentType, MimeTypeImageSvg)
|
||||
}
|
||||
|
||||
// IsPDF detects if data is a PDF format
|
||||
func (ct SniffedType) IsPDF() bool {
|
||||
return strings.Contains(ct.contentType, "application/pdf")
|
||||
}
|
||||
|
||||
// IsVideo detects if data is a video format
|
||||
func (ct SniffedType) IsVideo() bool {
|
||||
return strings.Contains(ct.contentType, "video/")
|
||||
}
|
||||
|
||||
// IsAudio detects if data is a video format
|
||||
func (ct SniffedType) IsAudio() bool {
|
||||
return strings.Contains(ct.contentType, "audio/")
|
||||
}
|
||||
|
||||
// IsRepresentableAsText returns true if file content can be represented as
|
||||
// plain text or is empty.
|
||||
func (ct SniffedType) IsRepresentableAsText() bool {
|
||||
return ct.IsText() || ct.IsSvgImage()
|
||||
}
|
||||
|
||||
// IsBrowsableBinaryType returns whether a non-text type can be displayed in a browser
|
||||
func (ct SniffedType) IsBrowsableBinaryType() bool {
|
||||
return ct.IsImage() || ct.IsSvgImage() || ct.IsPDF() || ct.IsVideo() || ct.IsAudio()
|
||||
}
|
||||
|
||||
// GetMimeType returns the mime type
|
||||
func (ct SniffedType) GetMimeType() string {
|
||||
return strings.SplitN(ct.contentType, ";", 2)[0]
|
||||
}
|
||||
|
||||
// detectFileTypeBox parses an ISO base media file format "ftyp" (file type) box located at the
// very beginning of data and returns the brands listed in it.
// https://en.wikipedia.org/wiki/ISO_base_media_file_format#File_type_box
//
// Layout read by this function: bytes 0-3 are the big-endian box size, bytes 4-7 the tag "ftyp",
// bytes 8-11 the first (major) brand, bytes 12-15 are skipped, and every complete 4-byte group
// from offset 16 up to the box size is an additional brand.
//
// found is false (and brands is nil) when data is shorter than 12 bytes, when the declared box
// size is below 12 or exceeds len(data), or when the tag is not "ftyp".
func detectFileTypeBox(data []byte) (brands []string, found bool) {
	const minBoxSize = 12 // size field + tag + first brand
	if len(data) < minBoxSize {
		return nil, false
	}

	// A size that does not fit into int becomes negative and is rejected by the lower bound.
	boxSize := int(binary.BigEndian.Uint32(data[:4]))
	if boxSize < minBoxSize || boxSize > len(data) {
		return nil, false
	}
	if string(data[4:8]) != "ftyp" {
		return nil, false
	}

	brands = append(brands, string(data[8:12]))
	// Trailing bytes that do not form a complete 4-byte group are ignored.
	for i := 16; i+4 <= boxSize; i += 4 {
		brands = append(brands, string(data[i:i+4]))
	}
	return brands, true
}
|
||||
|
||||
// isEmbeddedOpenType reports whether data begins with a plausible Embedded OpenType (EOT) header.
// https://www.w3.org/submissions/EOT
//
// All of the following must hold, otherwise false is returned:
//   - data has at least 80 bytes;
//   - the little-endian uint32 at offset 8 is one of the three defined versions;
//   - the little-endian uint16 at offset 34 is the magic number 0x504C ("LP");
//   - the 16 reserved bytes at offsets 64-79 are all zero.
func isEmbeddedOpenType(data []byte) bool {
	if len(data) < 80 {
		return false
	}

	// The standard is abandoned (for IE6-IE11 only), there are only 3 versions defined.
	switch binary.LittleEndian.Uint32(data[8:12]) {
	case 0x00010000, 0x00020001, 0x00020002:
	default:
		return false
	}

	// MagicNumber: 0x504C ("LP")
	if binary.LittleEndian.Uint16(data[34:36]) != 0x504C {
		return false
	}

	// Reserved 1-4 (each: unsigned long) must be zero.
	for _, b := range data[64:80] {
		if b != 0 {
			return false
		}
	}
	return true
}
|
||||
|
||||
// DetectContentType extends http.DetectContentType with more content types. Defaults to text/plain if input is empty.
|
||||
func DetectContentType(data []byte) SniffedType {
|
||||
if len(data) == 0 {
|
||||
return SniffedType{"text/plain"}
|
||||
}
|
||||
|
||||
ct := http.DetectContentType(data)
|
||||
|
||||
if len(data) > SniffContentSize {
|
||||
data = data[:SniffContentSize]
|
||||
}
|
||||
|
||||
const typeMsFontObject = "application/vnd.ms-fontobject"
|
||||
if ct == typeMsFontObject {
|
||||
// Stupid Golang blindly detects any content with 34th-35th bytes being "LP" as font.
|
||||
// If it is not really for ".eot" content, we try to detect it again by hiding the "LP", see the test for more details.
|
||||
if isEmbeddedOpenType(data) {
|
||||
return SniffedType{typeMsFontObject}
|
||||
}
|
||||
data = slices.Clone(data)
|
||||
data[34] = 'l'
|
||||
ct = http.DetectContentType(data)
|
||||
}
|
||||
|
||||
vars := globalVars()
|
||||
// SVG is unsupported by http.DetectContentType, https://github.com/golang/go/issues/15888
|
||||
detectByHTML := strings.Contains(ct, "text/plain") || strings.Contains(ct, "text/html")
|
||||
detectByXML := strings.Contains(ct, "text/xml")
|
||||
if detectByHTML || detectByXML {
|
||||
dataProcessed := vars.svgComment.ReplaceAll(data, nil)
|
||||
dataProcessed = bytes.TrimSpace(dataProcessed)
|
||||
if detectByHTML && vars.svgTagRegex.Match(dataProcessed) ||
|
||||
detectByXML && vars.svgTagInXMLRegex.Match(dataProcessed) {
|
||||
ct = MimeTypeImageSvg
|
||||
}
|
||||
}
|
||||
|
||||
if strings.HasPrefix(ct, "audio/") && bytes.HasPrefix(data, []byte("ID3")) {
|
||||
// The MP3 detection is quite inaccurate, any content with "ID3" prefix will result in "audio/mpeg".
|
||||
// So remove the "ID3" prefix and detect again, then if the result is "text", it must be text content.
|
||||
// This works especially because audio files contain many unprintable/invalid characters like `0x00`
|
||||
ct2 := http.DetectContentType(data[3:])
|
||||
if strings.HasPrefix(ct2, "text/") {
|
||||
ct = ct2
|
||||
}
|
||||
}
|
||||
|
||||
fileTypeBrands, found := detectFileTypeBox(data)
|
||||
if found && slices.Contains(fileTypeBrands, "avif") {
|
||||
ct = MimeTypeImageAvif
|
||||
}
|
||||
|
||||
if ct == "application/ogg" {
|
||||
dataHead := data
|
||||
if len(dataHead) > 256 {
|
||||
dataHead = dataHead[:256] // only need to do a quick check for the file header
|
||||
}
|
||||
if bytes.Contains(dataHead, []byte("theora")) || bytes.Contains(dataHead, []byte("dirac")) {
|
||||
ct = "video/ogg" // ogg is only used for some video formats, and it's not popular
|
||||
} else {
|
||||
ct = "audio/ogg" // for most cases, it is used as an audio container
|
||||
}
|
||||
}
|
||||
return SniffedType{ct}
|
||||
}
|
||||
|
||||
func FromContentType(contentType string) SniffedType {
|
||||
return SniffedType{contentType}
|
||||
}
|
||||
@@ -0,0 +1,179 @@
|
||||
// Copyright 2021 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package typesniffer
|
||||
|
||||
import (
|
||||
"encoding/base64"
|
||||
"encoding/hex"
|
||||
"net/http"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestDetectContentTypeLongerThanSniffLen(t *testing.T) {
|
||||
// Pre-condition: Shorter than sniffLen detects SVG.
|
||||
assert.Equal(t, "image/svg+xml", DetectContentType([]byte(`<!-- Comment --><svg></svg>`)).contentType)
|
||||
// Longer than sniffLen detects something else.
|
||||
assert.NotEqual(t, "image/svg+xml", DetectContentType([]byte(`<!-- `+strings.Repeat("x", SniffContentSize)+` --><svg></svg>`)).contentType)
|
||||
}
|
||||
|
||||
func TestIsTextFile(t *testing.T) {
|
||||
assert.True(t, DetectContentType([]byte{}).IsText())
|
||||
assert.True(t, DetectContentType([]byte("lorem ipsum")).IsText())
|
||||
}
|
||||
|
||||
func TestIsSvgImage(t *testing.T) {
|
||||
assert.True(t, DetectContentType([]byte("<svg></svg>")).IsSvgImage())
|
||||
assert.True(t, DetectContentType([]byte(" <svg></svg>")).IsSvgImage())
|
||||
assert.True(t, DetectContentType([]byte(`<svg width="100"></svg>`)).IsSvgImage())
|
||||
assert.True(t, DetectContentType([]byte(`<?xml version="1.0" encoding="UTF-8"?><svg></svg>`)).IsSvgImage())
|
||||
assert.True(t, DetectContentType([]byte(`<!-- Comment -->
|
||||
<svg></svg>`)).IsSvgImage())
|
||||
assert.True(t, DetectContentType([]byte(`<!-- Multiple -->
|
||||
<!-- Comments -->
|
||||
<svg></svg>`)).IsSvgImage())
|
||||
assert.True(t, DetectContentType([]byte(`<!-- Multiline
|
||||
Comment -->
|
||||
<svg></svg>`)).IsSvgImage())
|
||||
assert.True(t, DetectContentType([]byte(`<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1 Basic//EN"
|
||||
"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11-basic.dtd">
|
||||
<svg></svg>`)).IsSvgImage())
|
||||
assert.True(t, DetectContentType([]byte(`<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!-- Comment -->
|
||||
<svg></svg>`)).IsSvgImage())
|
||||
assert.True(t, DetectContentType([]byte(`<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!-- Multiple -->
|
||||
<!-- Comments -->
|
||||
<svg></svg>`)).IsSvgImage())
|
||||
assert.True(t, DetectContentType([]byte(`<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!-- Multiline
|
||||
Comment -->
|
||||
<svg></svg>`)).IsSvgImage())
|
||||
assert.True(t, DetectContentType([]byte(`<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
|
||||
<!-- Multiline
|
||||
Comment -->
|
||||
<svg></svg>`)).IsSvgImage())
|
||||
|
||||
// the DetectContentType should work for incomplete data, because only beginning bytes are used for detection
|
||||
assert.True(t, DetectContentType([]byte(`<svg>....`)).IsSvgImage())
|
||||
|
||||
assert.False(t, DetectContentType([]byte{}).IsSvgImage())
|
||||
assert.False(t, DetectContentType([]byte("svg")).IsSvgImage())
|
||||
assert.False(t, DetectContentType([]byte("<svgfoo></svgfoo>")).IsSvgImage())
|
||||
assert.False(t, DetectContentType([]byte("text<svg></svg>")).IsSvgImage())
|
||||
assert.False(t, DetectContentType([]byte("<html><body><svg></svg></body></html>")).IsSvgImage())
|
||||
assert.False(t, DetectContentType([]byte(`<script>"<svg></svg>"</script>`)).IsSvgImage())
|
||||
assert.False(t, DetectContentType([]byte(`<!-- <svg></svg> inside comment -->
|
||||
<foo></foo>`)).IsSvgImage())
|
||||
assert.False(t, DetectContentType([]byte(`<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!-- <svg></svg> inside comment -->
|
||||
<foo></foo>`)).IsSvgImage())
|
||||
|
||||
assert.False(t, DetectContentType([]byte(`
|
||||
<!-- comment1 -->
|
||||
<div>
|
||||
<!-- comment2 -->
|
||||
<svg></svg>
|
||||
</div>
|
||||
`)).IsSvgImage())
|
||||
|
||||
assert.False(t, DetectContentType([]byte(`
|
||||
<!-- comment1
|
||||
-->
|
||||
<div>
|
||||
<!-- comment2
|
||||
-->
|
||||
<svg></svg>
|
||||
</div>
|
||||
`)).IsSvgImage())
|
||||
assert.False(t, DetectContentType([]byte(`<html><body><!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"><svg></svg></body></html>`)).IsSvgImage())
|
||||
assert.False(t, DetectContentType([]byte(`<html><body><?xml version="1.0" encoding="UTF-8"?><svg></svg></body></html>`)).IsSvgImage())
|
||||
}
|
||||
|
||||
func TestIsPDF(t *testing.T) {
|
||||
pdf, _ := base64.StdEncoding.DecodeString("JVBERi0xLjYKJcOkw7zDtsOfCjIgMCBvYmoKPDwvTGVuZ3RoIDMgMCBSL0ZpbHRlci9GbGF0ZURlY29kZT4+CnN0cmVhbQp4nF3NPwsCMQwF8D2f4s2CNYk1baF0EHRwOwg4iJt/NsFb/PpevUE4Mjwe")
|
||||
assert.True(t, DetectContentType(pdf).IsPDF())
|
||||
assert.False(t, DetectContentType([]byte("plain text")).IsPDF())
|
||||
}
|
||||
|
||||
func TestIsVideo(t *testing.T) {
|
||||
mp4, _ := base64.StdEncoding.DecodeString("AAAAGGZ0eXBtcDQyAAAAAGlzb21tcDQyAAEI721vb3YAAABsbXZoZAAAAADaBlwX2gZcFwAAA+gA")
|
||||
assert.True(t, DetectContentType(mp4).IsVideo())
|
||||
assert.False(t, DetectContentType([]byte("plain text")).IsVideo())
|
||||
}
|
||||
|
||||
func TestIsAudio(t *testing.T) {
|
||||
mp3, _ := base64.StdEncoding.DecodeString("SUQzBAAAAAABAFRYWFgAAAASAAADbWFqb3JfYnJhbmQAbXA0MgBUWFhYAAAAEQAAA21pbm9yX3Zl")
|
||||
assert.True(t, DetectContentType(mp3).IsAudio())
|
||||
assert.False(t, DetectContentType([]byte("plain text")).IsAudio())
|
||||
|
||||
assert.True(t, DetectContentType([]byte("ID3Toy\000")).IsAudio())
|
||||
assert.True(t, DetectContentType([]byte("ID3Toy\n====\t* hi 🌞, ...")).IsText()) // test ID3 tag for plain text
|
||||
assert.True(t, DetectContentType([]byte("ID3Toy\n====\t* hi 🌞, ..."+"🌛"[0:2])).IsText()) // test ID3 tag with incomplete UTF8 char
|
||||
}
|
||||
|
||||
func TestDetectContentTypeOgg(t *testing.T) {
|
||||
oggAudio, _ := hex.DecodeString("4f67675300020000000000000000352f0000000000007dc39163011e01766f72626973000000000244ac0000000000000071020000000000b8014f6767530000")
|
||||
st := DetectContentType(oggAudio)
|
||||
assert.True(t, st.IsAudio())
|
||||
|
||||
oggVideo, _ := hex.DecodeString("4f676753000200000000000000007d9747ef000000009b59daf3012a807468656f7261030201001e00110001e000010e00020000001e00000001000001000001")
|
||||
st = DetectContentType(oggVideo)
|
||||
assert.True(t, st.IsVideo())
|
||||
}
|
||||
|
||||
func TestDetectFileTypeBox(t *testing.T) {
|
||||
_, found := detectFileTypeBox([]byte("\x00\x00\xff\xffftypAAAA...."))
|
||||
assert.False(t, found)
|
||||
|
||||
brands, found := detectFileTypeBox([]byte("\x00\x00\x00\x0cftypAAAA"))
|
||||
assert.True(t, found)
|
||||
assert.Equal(t, []string{"AAAA"}, brands)
|
||||
|
||||
brands, found = detectFileTypeBox([]byte("\x00\x00\x00\x10ftypAAAA....BBBB"))
|
||||
assert.True(t, found)
|
||||
assert.Equal(t, []string{"AAAA"}, brands)
|
||||
|
||||
brands, found = detectFileTypeBox([]byte("\x00\x00\x00\x14ftypAAAA....BBBB"))
|
||||
assert.True(t, found)
|
||||
assert.Equal(t, []string{"AAAA", "BBBB"}, brands)
|
||||
|
||||
_, found = detectFileTypeBox([]byte("\x00\x00\x00\x14ftypAAAA....BBB"))
|
||||
assert.False(t, found)
|
||||
|
||||
brands, found = detectFileTypeBox([]byte("\x00\x00\x00\x13ftypAAAA....BBB"))
|
||||
assert.True(t, found)
|
||||
assert.Equal(t, []string{"AAAA"}, brands)
|
||||
}
|
||||
|
||||
func TestDetectContentTypeAvif(t *testing.T) {
|
||||
buf := []byte("\x00\x00\x00\x20ftypavif.......................")
|
||||
st := DetectContentType(buf)
|
||||
assert.Equal(t, MimeTypeImageAvif, st.contentType)
|
||||
}
|
||||
|
||||
func TestDetectContentTypeIncorrectFont(t *testing.T) {
|
||||
s := "Stupid Golang keep detecting 34th LP as font"
|
||||
// They don't want to have any improvement to it: https://github.com/golang/go/issues/77172
|
||||
golangDetected := http.DetectContentType([]byte(s))
|
||||
assert.Equal(t, "application/vnd.ms-fontobject", golangDetected)
|
||||
// We have to make our patch to make it work correctly
|
||||
ourDetected := DetectContentType([]byte(s))
|
||||
assert.Equal(t, "text/plain; charset=utf-8", ourDetected.contentType)
|
||||
|
||||
// For binary content, ensure it still detects as font. The content is from "opensans-regular.eot"
|
||||
b := []byte{
|
||||
0x3d, 0x30, 0x00, 0x00, 0x6b, 0x2f, 0x00, 0x00, 0x02, 0x00, 0x02, 0x00, 0x04, 0x00, 0x00, 0x00,
|
||||
0x02, 0x0b, 0x06, 0x06, 0x03, 0x05, 0x04, 0x02, 0x02, 0x04, 0x01, 0x00, 0x90, 0x01, 0x00, 0x00,
|
||||
0x04, 0x00, 0x4c, 0x50, 0xef, 0x02, 0x00, 0xe0, 0x5b, 0x20, 0x00, 0x40, 0x28, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x9f, 0x01, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x63, 0xf4, 0x17, 0x14,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x12, 0x00, 0x4f, 0x00, 0x70, 0x00, 0x65, 0x00, 0x6e, 0x00, 0x20, 0x00, 0x53, 0x00,
|
||||
}
|
||||
assert.Equal(t, "application/vnd.ms-fontobject", http.DetectContentType(b))
|
||||
assert.Equal(t, "application/vnd.ms-fontobject", DetectContentType(b).contentType)
|
||||
}
|
||||
Reference in New Issue
Block a user