Files
2026-05-30 22:47:36 +08:00

138 lines
5.5 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// Copyright 2026 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package incoming
import (
"regexp"
"strings"
"sync"
"gitea.dev/modules/util"
)
// Regular-expression fragments spliced into the expressions built by patterns.
// All alternatives are written in lowercase; the expressions that embed them are
// compiled with the (?i) flag.
const (
	yearToken = `\b\d{4}\b`            // 4-digit year
	timeToken = `\b\d{1,2}[:.]\d{2}\b` // HH:MM or HH.MM (one- or two-digit hour)
	// "wrote" verbs ending an attribution line; CJK ones are matched without a
	// preceding word-separator since those scripts don't space their words
	wroteVerbs    = `wrote|writes|schrieb|skrev|napisał|escreveu|escribió|написал|пише|a écrit`
	cjkWroteVerbs = `写道|寫道|書きました|작성`
	// device names anchoring CJK mobile signatures, so prose isn't mistaken for one
	cjkDevice = `iphone|ipad|ipod|android|galaxy|手机|手機|平板`
)
// Forwarded-mail header field names across the common mail clients/locales.
// headerFromFields (the "From"-equivalents) must begin a block; headerFields is the
// full set allowed to follow and also contains every headerFromFields entry.
//
// Entries are matched as a prefix by headerLine, which lowercases the line first, so
// every entry must be lowercase. Adding a locale is a one-line change.
var (
	headerFromFields = []string{
		"from", "fra", "de", "von", "da", "van", "från", "expéditeur",
		"发件人", "寄件者", "差出人", "보낸사람",
	}
	// the literal holds the non-"From" fields; the "From" fields are appended to it
	headerFields = append([]string{
		"to", "cc", "bcc", "sent", "date", "subject", "reply-to",
		"til", "emne", "an", "betreff", "gesendet", "para", "assunto", "asunto",
		"risposta", "inviato", "oggetto", "destinataire", "objet", "répondre à",
		"aan", "onderwerp", "beantwoorden", "skickat", "till", "ämne",
		"收件人", "主题", "主旨", "主題", "收件者", "抄送", "日期", "宛先", "件名", "받는사람", "제목",
	}, headerFromFields...)
)
// patterns returns the compiled regular expressions used by extractReply. They are
// compiled once, on first use, so the incoming-mail feature adds nothing to startup.
// Every expression is case-insensitive and anchored to a whole line; extractReply
// matches them against whitespace-trimmed lines.
//
//   - signature:   a signature delimiter or a "sent from my device" style footer
//   - attribution: the line introducing quoted history ("… wrote:")
//   - separator:   a rule line or an "Original Message"-style divider
var patterns = sync.OnceValue(func() (ret struct {
	signature, attribution, separator *regexp.Regexp
},
) {
	// Colon closing an attribution line. Besides the ASCII colon this accepts the
	// fullwidth colon U+FF1A, the form CJK text normally ends such a line with. It
	// is written as an escape so it survives tools that strip or flag look-alike
	// Unicode characters.
	const colon = `[:\x{FF1A}]`
	// What may follow a "wrote" verb that is not required to end in a colon:
	// nothing, or a non-letter/digit and then anything. Go's \b only recognizes
	// ASCII word characters, so it cannot be used after verbs ending in a
	// non-ASCII letter such as "napisał" or "написал".
	const verbEnd = `(?:[^\pL\pN_].*)?`

	// "-- " delimiter and common mobile footers with frequent localizations. The CJK
	// forms require a device name so ordinary prose like "发自我的内心" or "会議から送信"
	// is not mistaken for a signature.
	ret.signature = regexp.MustCompile(`(?i)^(--|__|—` +
		`|sent (from|via|with) .+|get outlook for .+` +
		`|envoyé depuis mon .+|sendt fra min .+|von meinem .+|verzonden (met|vanaf) .+` +
		`|(發|发)自我的.*(` + cjkDevice + `).*` +
		`|.*(` + cjkDevice + `).*(から送信|에서 보냄|傳送|发送))$`)
	// attribution introducing quoted history: a line ending in a "wrote:" verb
	// (Latin/Cyrillic or CJK), a "Name <email> wrote" line, a lead word directly
	// followed by a day number or weekday plus a year and a time, or an ISO-date-led
	// line. The date phrasing, trailing colon and the email before the verb guard
	// against prose (so "On the 2024 roadmap … at 10:00" is not an attribution).
	ret.attribution = regexp.MustCompile(`(?i)^>*\s*(` +
		`.*[\s">'](` + wroteVerbs + `)\s*` + colon +
		`|.*(` + cjkWroteVerbs + `)\s*` + colon +
		`|.*<\S+@\S+>\s+(` + wroteVerbs + `)` + verbEnd +
		`|(on|at|le|am|el|em|den|il|op|dnia|w dniu)\b[\s,]*(\d|(?:mon|tue|wed|thu|fri|sat|sun)\b).*` + yearToken + `.*` + timeToken + `.*` +
		`|\d{4}-\d{2}-\d{2}\b.*` + timeToken + `.*` +
		`)$`)
	// a dash/underscore rule line, or text fenced by dashes such as
	// "-------- Original Message --------" or "-----Mensaje original-----"
	ret.separator = regexp.MustCompile(`(?i)^\s*\*?\s*([-_]{5,}|-{2,}.+-{2,}|original message|forwarded message)\s*\*?\s*$`)
	return ret
})
// extractReply returns the part of a plain-text email body that the sender wrote,
// with quoted history, the reply attribution, signatures and forwarded headers
// removed. It is a slim, dependency-free reimplementation based on
// github.com/dimiro1/reply (MIT) that covers the common mail-client formats and
// languages; bottom posting and forwarded bodies are not handled.
func extractReply(text string) string {
	re := patterns()
	body := strings.Split(util.NormalizeStringEOL(text), "\n")

	// startsHistory reports whether the line at idx opens quoted history, a
	// signature, a separator or a forwarded header block.
	startsHistory := func(idx int) bool {
		line := strings.TrimSpace(body[idx])
		switch {
		case re.signature.MatchString(line),
			re.attribution.MatchString(line),
			re.separator.MatchString(line):
			return true
		default:
			return headerBlock(line, body[idx+1:])
		}
	}

	// keep only what precedes the first such boundary line
	cut := 0
	for cut < len(body) && !startsHistory(cut) {
		cut++
	}
	body = body[:cut]

	// walk back over the trailing run of blank, quoted (">") and "ᐧ" lines; "ᐧ" is
	// the trailing marker some mobile clients (Mailbox) append
	keep := len(body)
	for ; keep > 0; keep-- {
		last := strings.TrimSpace(body[keep-1])
		if last != "" && last != "ᐧ" && !strings.HasPrefix(last, ">") {
			break
		}
	}
	if keep == 0 {
		// nothing but quoted/blank lines: leave the body as it is
		keep = len(body)
	}
	return strings.TrimSpace(strings.Join(body[:keep], "\n"))
}
// headerBlock reports whether a forwarded-mail header block starts at first.
// first must already be trimmed and be a "From"-type field, and the next non-blank
// line in rest must be a header field too; a single header-looking sentence on its
// own is therefore not treated as a boundary.
func headerBlock(first string, rest []string) bool {
	if !headerLine(first, headerFromFields) {
		return false
	}
	// skip the blank lines that may separate the two fields
	idx := 0
	for idx < len(rest) && strings.TrimSpace(rest[idx]) == "" {
		idx++
	}
	if idx == len(rest) {
		// nothing but blank lines follow the "From" field
		return false
	}
	return headerLine(strings.TrimSpace(rest[idx]), headerFields)
}
// headerLine reports whether the already-trimmed line is a "Field:" header for one
// of fields. The line is lowercased before comparison, so fields must be lowercase.
//
// An ASCII colon must be followed by a space so prose like "To:do this" is ignored;
// the CJK fullwidth colon (U+FF1A) needs no space. A line that only starts with a
// field name but has no colon after it ("Tomorrow …") is not a header.
func headerLine(line string, fields []string) bool {
	// U+FF1A FULLWIDTH COLON, written as an escape so it cannot be silently lost or
	// confused with the ASCII colon by tools that strip look-alike characters
	const fullwidthColon = "\uff1a"

	lower := strings.ToLower(line)
	for _, field := range fields {
		rest, ok := strings.CutPrefix(lower, field)
		if !ok {
			continue
		}
		if strings.HasPrefix(rest, ": ") || strings.HasPrefix(rest, fullwidthColon) {
			return true
		}
	}
	return false
}