初始提交: Gitea 项目代码

2026-05-30 22:47:36 +08:00
commit f288f76350
6116 changed files with 776822 additions and 0 deletions
@@ -0,0 +1,409 @@
+// Copyright 2023 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package elasticsearch
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"io"
+	"net"
+	"net/http"
+	"net/url"
+	"slices"
+	"strconv"
+	"strings"
+	"time"
+
+	"gitea.dev/modules/indexer/internal"
+	"gitea.dev/modules/json"
+)
+
+// Compile-time assertion that *Indexer satisfies internal.Indexer.
+var _ internal.Indexer = (*Indexer)(nil)
+
+// Indexer is a thin HTTP client for one versioned index on an
+// Elasticsearch/OpenSearch cluster. It only relies on the REST subset that
+// Elasticsearch 7/8/9 and OpenSearch 3 have in common.
+type Indexer struct {
+	// client is created by Init and cleared again by Close.
+	client *http.Client
+	// base holds the raw URL until Init runs; afterwards it is the base URL
+	// with a trailing slash and without userinfo.
+	base string
+	// user and pass are the basic-auth credentials taken from the URL.
+	user, pass string
+
+	indexName string // index name without the version suffix
+	version   int    // index version; 0 selects the unversioned name
+	mapping   string // request body sent when the index is created
+}
+
+// NewIndexer returns an Indexer for the given cluster URL, index name, index
+// version and index-creation body. Nothing is sent over the network here;
+// Init opens the connection.
+func NewIndexer(rawURL, indexName string, version int, mapping string) *Indexer {
+	ix := new(Indexer)
+	ix.base = rawURL
+	ix.indexName = indexName
+	ix.version = version
+	ix.mapping = mapping
+	return ix
+}
+
+// Init parses the configured URL, builds the HTTP client and makes sure the
+// versioned index exists. It returns true when the index was already present
+// and false when it had to be created or when an error occurred.
+func (i *Indexer) Init(ctx context.Context) (bool, error) {
+	u, err := url.Parse(i.base)
+	if err != nil {
+		return false, fmt.Errorf("parse elasticsearch url: %w", err)
+	}
+	// Pull the credentials out of the URL; requests carry them as basic auth.
+	if info := u.User; info != nil {
+		i.user = info.Username()
+		i.pass, _ = info.Password()
+		u.User = nil
+	}
+	i.base = u.String()
+	if !strings.HasSuffix(i.base, "/") {
+		i.base += "/"
+	}
+
+	// The client deliberately has no overall Timeout: bulk and
+	// _delete_by_query calls may legitimately take minutes on large repos, so
+	// the per-request deadline is whatever the caller's ctx carries. The
+	// transport-level timeouts below still bound connects, TLS handshakes and
+	// response headers, so a half-open server cannot stall the indexer forever.
+	dialer := &net.Dialer{Timeout: 30 * time.Second, KeepAlive: 30 * time.Second}
+	transport := &http.Transport{
+		Proxy:                 http.ProxyFromEnvironment,
+		DialContext:           dialer.DialContext,
+		TLSHandshakeTimeout:   10 * time.Second,
+		ResponseHeaderTimeout: 30 * time.Second,
+		ExpectContinueTimeout: 1 * time.Second,
+		IdleConnTimeout:       90 * time.Second,
+		MaxIdleConns:          100,
+	}
+	i.client = &http.Client{Transport: transport}
+
+	exists, err := i.indexExists(ctx, i.VersionedIndexName())
+	switch {
+	case err != nil:
+		return false, err
+	case exists:
+		return true, nil
+	}
+	return false, i.createIndex(ctx)
+}
+
+// Ping queries _cluster/health and reports an error unless the cluster
+// status is "green" (healthy) or "yellow" (usable); any other status,
+// such as red, is treated as unusable.
+// https://www.elastic.co/guide/en/elasticsearch/reference/current/cluster-health.html
+func (i *Indexer) Ping(ctx context.Context) error {
+	var health struct {
+		Status string `json:"status"`
+	}
+	if err := i.doJSON(ctx, http.MethodGet, "_cluster/health", nil, &health); err != nil {
+		return err
+	}
+	switch health.Status {
+	case "green", "yellow":
+		return nil
+	default:
+		return fmt.Errorf("status of elasticsearch cluster is %s", health.Status)
+	}
+}
+
+// Close drops the idle HTTP connections of the client and forgets the
+// client. It is a no-op on a nil Indexer or when no client is set.
+func (i *Indexer) Close() {
+	if i != nil && i.client != nil {
+		i.client.CloseIdleConnections()
+		i.client = nil
+	}
+}
+
+// Bulk sends all ops to the _bulk endpoint of the versioned index in a
+// single NDJSON request. An empty ops slice is a no-op. When the reply
+// flags errors, the first item-level failure is returned.
+func (i *Indexer) Bulk(ctx context.Context, ops []BulkOp) error {
+	if len(ops) == 0 {
+		return nil
+	}
+
+	target := i.VersionedIndexName()
+	payload := new(bytes.Buffer)
+	payload.Grow(256 * len(ops))
+	for idx := range ops {
+		op := &ops[idx]
+		// Every op starts with an action line; only index ops are followed
+		// by a document line.
+		header := map[string]any{op.action: map[string]any{"_index": target, "_id": op.id}}
+		if err := writeJSONLine(payload, header); err != nil {
+			return err
+		}
+		if op.action != bulkActionIndex {
+			continue
+		}
+		if err := writeJSONLine(payload, op.doc); err != nil {
+			return err
+		}
+	}
+
+	res, err := i.do(ctx, http.MethodPost, urlPath(target, "_bulk"), "application/x-ndjson", bytes.NewReader(payload.Bytes()))
+	if err != nil {
+		return err
+	}
+	defer drainAndClose(res)
+
+	var reply struct {
+		Errors bool `json:"errors"`
+		Items  []map[string]struct {
+			Status int        `json:"status"`
+			Error  json.Value `json:"error"`
+		} `json:"items"`
+	}
+	if err := json.NewDecoder(res.Body).Decode(&reply); err != nil {
+		return err
+	}
+	if reply.Errors {
+		return firstBulkError(reply.Items)
+	}
+	return nil
+}
+
+// firstBulkError scans the items of a bulk reply and returns an error for
+// the first one that failed, or nil when none did. Every entry is a map with
+// a single key naming the action ({"index": {...}} or {"delete": {...}}).
+// A delete that answers 404 is idempotent and therefore not reported.
+func firstBulkError(items []map[string]struct {
+	Status int        `json:"status"`
+	Error  json.Value `json:"error"`
+},
+) error {
+	for _, entry := range items {
+		for action, outcome := range entry {
+			switch {
+			case outcome.Status < 300:
+				// The operation succeeded.
+			case action == bulkActionDelete && outcome.Status == http.StatusNotFound:
+				// Deleting a missing document is fine.
+			default:
+				return fmt.Errorf("bulk %s failed (status %d): %s", action, outcome.Status, string(outcome.Error))
+			}
+		}
+	}
+	return nil
+}
+
+// Index stores doc under id in the versioned index using PUT _doc/<id>.
+func (i *Indexer) Index(ctx context.Context, id string, doc any) error {
+	payload, err := json.Marshal(doc)
+	if err != nil {
+		return err
+	}
+	target := urlPath(i.VersionedIndexName(), "_doc", id)
+	return i.doJSON(ctx, http.MethodPut, target, bytes.NewReader(payload), nil)
+}
+
+// Delete removes the document with the given id. A 404 reply (unknown id)
+// is accepted and not reported as an error.
+func (i *Indexer) Delete(ctx context.Context, id string) error {
+	target := urlPath(i.VersionedIndexName(), "_doc", id)
+	res, err := i.do(ctx, http.MethodDelete, target, "", nil, http.StatusNotFound)
+	if err == nil {
+		drainAndClose(res)
+	}
+	return err
+}
+
+// DeleteByQuery posts query to _delete_by_query, removing every document of
+// the versioned index that matches it.
+func (i *Indexer) DeleteByQuery(ctx context.Context, query Query) error {
+	request := map[string]any{"query": query.querySource()}
+	payload, err := json.Marshal(request)
+	if err != nil {
+		return err
+	}
+	target := urlPath(i.VersionedIndexName(), "_delete_by_query")
+	return i.doJSON(ctx, http.MethodPost, target, bytes.NewReader(payload), nil)
+}
+
+// Refresh posts to _refresh on the versioned index so that recent writes
+// become visible to searches.
+func (i *Indexer) Refresh(ctx context.Context) error {
+	target := urlPath(i.VersionedIndexName(), "_refresh")
+	return i.doJSON(ctx, http.MethodPost, target, nil, nil)
+}
+
+// Search posts req to the _search endpoint of the versioned index and
+// decodes the reply.
+//
+// Only the parts of req that are set end up in the request body; "size" is
+// always sent. track_total_hits is always passed explicitly because the
+// server default caps the count at 10000: true asks for exact totals, false
+// skips counting.
+func (i *Indexer) Search(ctx context.Context, req SearchRequest) (*SearchResponse, error) {
+	body := map[string]any{"size": req.Size}
+	if req.Query != nil {
+		body["query"] = req.Query.querySource()
+	}
+	if n := len(req.Sort); n > 0 {
+		sorts := make([]map[string]any, 0, n)
+		for _, field := range req.Sort {
+			sorts = append(sorts, field.source())
+		}
+		body["sort"] = sorts
+	}
+	if req.From > 0 {
+		body["from"] = req.From
+	}
+	if len(req.Aggregations) > 0 {
+		body["aggs"] = req.Aggregations
+	}
+	if len(req.Highlight) > 0 {
+		body["highlight"] = req.Highlight
+	}
+
+	payload, err := json.Marshal(body)
+	if err != nil {
+		return nil, err
+	}
+
+	target := urlPath(i.VersionedIndexName(), "_search") + "?track_total_hits=" + strconv.FormatBool(req.TrackTotal)
+	res, err := i.do(ctx, http.MethodPost, target, "application/json", bytes.NewReader(payload))
+	if err != nil {
+		return nil, err
+	}
+	defer drainAndClose(res)
+	return decodeSearchResponse(res.Body)
+}
+
+// indexExists issues HEAD /<name> and reports whether the reply was 200 OK.
+// A 404 reply means "does not exist" and is not an error.
+func (i *Indexer) indexExists(ctx context.Context, name string) (bool, error) {
+	res, err := i.do(ctx, http.MethodHead, urlPath(name), "", nil, http.StatusNotFound)
+	if err != nil {
+		return false, err
+	}
+	defer drainAndClose(res)
+	return res.StatusCode == http.StatusOK, nil
+}
+
+// createIndex creates the versioned index with the configured mapping as the
+// request body and fails unless the reply is acknowledged. After a successful
+// creation it checks for indexes of older versions.
+func (i *Indexer) createIndex(ctx context.Context) error {
+	name := i.VersionedIndexName()
+	var ack struct {
+		Acknowledged bool `json:"acknowledged"`
+	}
+	err := i.doJSON(ctx, http.MethodPut, urlPath(name), bytes.NewBufferString(i.mapping), &ack)
+	switch {
+	case err != nil:
+		return fmt.Errorf("create index %s: %w", name, err)
+	case !ack.Acknowledged:
+		return fmt.Errorf("create index %s not acknowledged", name)
+	}
+
+	i.checkOldIndexes(ctx)
+	return nil
+}
+
+// do sends one HTTP request to the cluster and returns the raw response.
+//
+// path is appended to the base URL as-is, so callers must escape it (see
+// urlPath). The Content-Type header is only set when contentType is
+// non-empty, and basic auth is only attached when a user or password is
+// configured. A status >= 300 becomes an error carrying a snippet of the
+// response body, unless that status is listed in okStatus. On success the
+// caller owns res.Body and must close it.
+//
+// If no client is set (Init has not run yet, or Close was already called) an
+// error is returned instead of dereferencing the nil client.
+func (i *Indexer) do(ctx context.Context, method, path, contentType string, body io.Reader, okStatus ...int) (*http.Response, error) {
+	if i.client == nil {
+		return nil, fmt.Errorf("%s %s: elasticsearch indexer is not initialized or already closed", method, path)
+	}
+	req, err := http.NewRequestWithContext(ctx, method, i.base+path, body)
+	if err != nil {
+		return nil, err
+	}
+	if contentType != "" {
+		req.Header.Set("Content-Type", contentType)
+	}
+	if i.user != "" || i.pass != "" {
+		req.SetBasicAuth(i.user, i.pass)
+	}
+	res, err := i.client.Do(req)
+	if err != nil {
+		return nil, err
+	}
+	if res.StatusCode >= 300 && !slices.Contains(okStatus, res.StatusCode) {
+		// readErrBody drains the body so the connection can be reused.
+		msg := readErrBody(res)
+		res.Body.Close()
+		return nil, fmt.Errorf("%s %s: %s", method, path, msg)
+	}
+	return res, nil
+}
+
+// doJSON sends a request with a JSON body and, when out is non-nil, decodes
+// the JSON response into it.
+func (i *Indexer) doJSON(ctx context.Context, method, path string, body io.Reader, out any) error {
+	contentType := ""
+	if body != nil {
+		contentType = "application/json"
+	}
+	res, err := i.do(ctx, method, path, contentType, body)
+	if err != nil {
+		return err
+	}
+	defer drainAndClose(res)
+	if out == nil {
+		return nil
+	}
+	return json.NewDecoder(res.Body).Decode(out)
+}
+
+// drainAndClose reads whatever is left of the response body and then closes
+// it, so the underlying TCP connection stays eligible for keep-alive reuse.
+func drainAndClose(res *http.Response) {
+	defer res.Body.Close()
+	_, _ = io.Copy(io.Discard, res.Body)
+}
+
+// writeJSONLine appends v to buf as one JSON document followed by a newline.
+// Nothing is written when marshalling fails.
+func writeJSONLine(buf *bytes.Buffer, v any) error {
+	line, err := json.Marshal(v)
+	if err == nil {
+		buf.Write(line)
+		buf.WriteByte('\n')
+	}
+	return err
+}
+
+// readErrBody formats an error response as "status <code>: <body>". At most
+// 4 KiB of the body is kept (whitespace-trimmed); the remainder is read and
+// discarded so the connection can be reused, since keep-alive requires the
+// body to be fully consumed.
+func readErrBody(res *http.Response) string {
+	const maxErrBody = 4 << 10
+	snippet, _ := io.ReadAll(io.LimitReader(res.Body, maxErrBody))
+	_, _ = io.Copy(io.Discard, res.Body)
+	snippet = bytes.TrimSpace(snippet)
+	return fmt.Sprintf("status %d: %s", res.StatusCode, snippet)
+}
+
+// decodeSearchResponse decodes a _search reply from r into a SearchResponse.
+// It copies hits.total.value, every hit (id, score, source, highlight) and,
+// when present, the buckets (key, doc_count) of each aggregation.
+func decodeSearchResponse(r io.Reader) (*SearchResponse, error) {
+	var raw struct {
+		Hits struct {
+			Total struct {
+				Value int64 `json:"value"`
+			} `json:"total"`
+			Hits []struct {
+				ID        string              `json:"_id"`
+				Score     float64             `json:"_score"`
+				Source    json.Value          `json:"_source"`
+				Highlight map[string][]string `json:"highlight"`
+			} `json:"hits"`
+		} `json:"hits"`
+		Aggregations map[string]struct {
+			Buckets []struct {
+				Key      any   `json:"key"`
+				DocCount int64 `json:"doc_count"`
+			} `json:"buckets"`
+		} `json:"aggregations"`
+	}
+	if err := json.NewDecoder(r).Decode(&raw); err != nil {
+		return nil, err
+	}
+
+	hits := raw.Hits.Hits
+	resp := &SearchResponse{
+		Total: raw.Hits.Total.Value,
+		Hits:  make([]SearchHit, len(hits)),
+	}
+	for idx := range hits {
+		hit := &hits[idx]
+		resp.Hits[idx] = SearchHit{
+			ID:        hit.ID,
+			Score:     hit.Score,
+			Source:    hit.Source,
+			Highlight: hit.Highlight,
+		}
+	}
+
+	// Aggregations stays nil when the reply carries none.
+	if len(raw.Aggregations) == 0 {
+		return resp, nil
+	}
+	resp.Aggregations = make(map[string][]AggBucket, len(raw.Aggregations))
+	for name, agg := range raw.Aggregations {
+		buckets := make([]AggBucket, 0, len(agg.Buckets))
+		for _, bucket := range agg.Buckets {
+			buckets = append(buckets, AggBucket{Key: bucket.Key, DocCount: bucket.DocCount})
+		}
+		resp.Aggregations[name] = buckets
+	}
+	return resp, nil
+}
+
+// urlPath joins path segments with `/` and percent-escapes each.
+func urlPath(segments ...string) string {
+	var b bytes.Buffer
+	for idx, s := range segments {
+		if idx > 0 {
+			b.WriteByte('/')
+		}
+		b.WriteString(url.PathEscape(s))
+	}
+	return b.String()
+}
@@ -0,0 +1,39 @@
+// Copyright 2026 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package elasticsearch
+
+import (
+	"strings"
+	"testing"
+
+	"gitea.dev/modules/test"
+
+	"github.com/stretchr/testify/require"
+)
+
+func newRealIndexer(t *testing.T) *Indexer {
+	t.Helper()
+	esURL := test.ExternalServiceHTTP(t, "TEST_ELASTICSEARCH_URL", "http://elasticsearch:9200")
+	indexName := "gitea_test_" + strings.ReplaceAll(strings.ToLower(t.Name()), "/", "_")
+	ix := NewIndexer(esURL, indexName, 1, `{"mappings":{"properties":{"x":{"type":"keyword"}}}}`)
+	_, err := ix.Init(t.Context())
+	require.NoError(t, err)
+	t.Cleanup(ix.Close)
+	return ix
+}
+
+// TestPing checks that Ping succeeds against the real cluster.
+func TestPing(t *testing.T) {
+	err := newRealIndexer(t).Ping(t.Context())
+	require.NoError(t, err)
+}
+
+// TestDeleteSwallows404 checks that deleting an unknown id is not an error.
+func TestDeleteSwallows404(t *testing.T) {
+	err := newRealIndexer(t).Delete(t.Context(), "missing-id")
+	require.NoError(t, err)
+}
+
+// TestBulkAcceptsDelete404 checks that a bulk delete of an unknown id is not
+// reported as a failure.
+func TestBulkAcceptsDelete404(t *testing.T) {
+	ops := []BulkOp{DeleteOp("missing-id")}
+	err := newRealIndexer(t).Bulk(t.Context(), ops)
+	require.NoError(t, err)
+}
@@ -0,0 +1,132 @@
+// Copyright 2026 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package elasticsearch
+
+// MultiMatch types used by the call sites; pass one of them to
+// MultiMatchQuery.Type. See
+// https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-multi-match-query.html#multi-match-types
+const (
+	// MultiMatchTypeBestFields is the "best_fields" multi_match type.
+	MultiMatchTypeBestFields   = "best_fields"
+	// MultiMatchTypePhrasePrefix is the "phrase_prefix" multi_match type.
+	MultiMatchTypePhrasePrefix = "phrase_prefix"
+)
+
+// ToAnySlice copies s into a new []any of the same length, so that a typed
+// slice can be spread into variadic ...any parameters such as TermsQuery's.
+// The result is never nil, even for a nil or empty input.
+func ToAnySlice[T any](s []T) []any {
+	boxed := make([]any, 0, len(s))
+	for _, elem := range s {
+		boxed = append(boxed, elem)
+	}
+	return boxed
+}
+
+// Query is an Elasticsearch query DSL node. It marshals to the JSON
+// object expected by the ES query API.
+type Query interface {
+	// querySource returns the node as a JSON-marshalable map. The method is
+	// unexported, so only this package can implement Query.
+	querySource() map[string]any
+}
+
+// rawQuery is a Query that is already in its JSON-shaped map form.
+type rawQuery map[string]any
+
+// querySource returns the underlying map itself, not a copy.
+func (q rawQuery) querySource() map[string]any {
+	return map[string]any(q)
+}
+
+// TermQuery builds a "term" query: documents whose `field` exactly equals
+// `value`.
+func TermQuery(field string, value any) Query {
+	clause := map[string]any{field: value}
+	return rawQuery{"term": clause}
+}
+
+// TermsQuery builds a "terms" query: documents whose `field` equals any one
+// of `values`.
+func TermsQuery(field string, values ...any) Query {
+	clause := map[string]any{field: values}
+	return rawQuery{"terms": clause}
+}
+
+// MatchQuery builds a "match" query: a full-text match of `value` against a
+// single field.
+func MatchQuery(field string, value any) Query {
+	clause := map[string]any{field: value}
+	return rawQuery{"match": clause}
+}
+
+// MatchPhraseQuery builds a "match_phrase" query: documents containing the
+// exact phrase `value` in `field`.
+func MatchPhraseQuery(field, value string) Query {
+	clause := map[string]any{field: value}
+	return rawQuery{"match_phrase": clause}
+}
+
+// MultiMatchQuery builds a multi_match query through chained setters.
+type MultiMatchQuery struct {
+	query  any      // value sent as "query"
+	fields []string // sent as "fields"; left out when empty
+	// typ and operator are sent as "type" and "operator"; empty strings are
+	// left out.
+	typ, operator string
+}
+
+// NewMultiMatchQuery starts a multi_match query for `query` over the given
+// fields.
+func NewMultiMatchQuery(query any, fields ...string) *MultiMatchQuery {
+	m := new(MultiMatchQuery)
+	m.query, m.fields = query, fields
+	return m
+}
+
+// Type sets the multi_match "type" and returns the receiver for chaining.
+func (m *MultiMatchQuery) Type(t string) *MultiMatchQuery {
+	m.typ = t
+	return m
+}
+
+// Operator sets the multi_match "operator" and returns the receiver for
+// chaining.
+func (m *MultiMatchQuery) Operator(op string) *MultiMatchQuery {
+	m.operator = op
+	return m
+}
+
+// querySource renders the query as {"multi_match": {...}}, including only
+// the optional settings that were given.
+func (m *MultiMatchQuery) querySource() map[string]any {
+	inner := map[string]any{"query": m.query}
+	if len(m.fields) != 0 {
+		inner["fields"] = m.fields
+	}
+	if m.typ != "" {
+		inner["type"] = m.typ
+	}
+	if m.operator != "" {
+		inner["operator"] = m.operator
+	}
+	return map[string]any{"multi_match": inner}
+}
+
+// RangeQuery builds a range query on one field through chained setters.
+type RangeQuery struct {
+	field string         // field the bounds apply to
+	body  map[string]any // bounds collected so far ("gte", "lte")
+}
+
+// NewRangeQuery starts a range query on `field` without any bounds.
+func NewRangeQuery(field string) *RangeQuery {
+	r := new(RangeQuery)
+	r.field = field
+	r.body = make(map[string]any)
+	return r
+}
+
+// Gte sets the inclusive lower bound and returns the receiver for chaining.
+func (r *RangeQuery) Gte(v any) *RangeQuery {
+	r.body["gte"] = v
+	return r
+}
+
+// Lte sets the inclusive upper bound and returns the receiver for chaining.
+func (r *RangeQuery) Lte(v any) *RangeQuery {
+	r.body["lte"] = v
+	return r
+}
+
+// querySource renders the query as {"range": {<field>: {...bounds...}}}.
+func (r *RangeQuery) querySource() map[string]any {
+	perField := map[string]any{r.field: r.body}
+	return map[string]any{"range": perField}
+}
+
+// BoolQuery builds a bool query through chained setters.
+type BoolQuery struct {
+	// Clauses collected for "must", "should" and "must_not" respectively.
+	must, should, mustNot []Query
+}
+
+// NewBoolQuery starts a bool query without clauses.
+func NewBoolQuery() *BoolQuery {
+	return new(BoolQuery)
+}
+
+// Must appends "must" clauses and returns the receiver for chaining.
+func (b *BoolQuery) Must(q ...Query) *BoolQuery {
+	b.must = append(b.must, q...)
+	return b
+}
+
+// Should appends "should" clauses and returns the receiver for chaining.
+func (b *BoolQuery) Should(q ...Query) *BoolQuery {
+	b.should = append(b.should, q...)
+	return b
+}
+
+// MustNot appends "must_not" clauses and returns the receiver for chaining.
+func (b *BoolQuery) MustNot(q ...Query) *BoolQuery {
+	b.mustNot = append(b.mustNot, q...)
+	return b
+}
+
+// querySource renders the query as {"bool": {...}}, leaving out every clause
+// list that is empty.
+func (b *BoolQuery) querySource() map[string]any {
+	groups := []struct {
+		key     string
+		queries []Query
+	}{
+		{"must", b.must},
+		{"should", b.should},
+		{"must_not", b.mustNot},
+	}
+	clauses := map[string]any{}
+	for _, group := range groups {
+		if len(group.queries) > 0 {
+			clauses[group.key] = querySlice(group.queries)
+		}
+	}
+	return map[string]any{"bool": clauses}
+}
+
+// querySlice renders every query with querySource, preserving the order.
+func querySlice(queries []Query) []map[string]any {
+	rendered := make([]map[string]any, 0, len(queries))
+	for _, query := range queries {
+		rendered = append(rendered, query.querySource())
+	}
+	return rendered
+}
@@ -0,0 +1,76 @@
+// Copyright 2026 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package elasticsearch
+
+import "gitea.dev/modules/json"
+
+// Action names used as the key of a bulk action line and matched against
+// the keys of the bulk response items.
+const (
+	bulkActionIndex  = "index"
+	bulkActionDelete = "delete"
+)
+
+// BulkOp describes one write of a Bulk call. Build values with IndexOp or
+// DeleteOp.
+type BulkOp struct {
+	// action is the bulk action name; id is the target document id.
+	action, id string
+	// doc is the document body; only written for index operations.
+	doc any
+}
+
+// IndexOp builds a bulk index operation.
+func IndexOp(id string, doc any) BulkOp {
+	return BulkOp{action: bulkActionIndex, id: id, doc: doc}
+}
+
+// DeleteOp builds a bulk delete operation.
+func DeleteOp(id string) BulkOp {
+	return BulkOp{action: bulkActionDelete, id: id}
+}
+
+// SortField is one entry of the search sort array.
+type SortField struct {
+	Field string // name of the field to sort on
+	Desc  bool   // true sorts descending, false ascending
+}
+
+// source renders the entry as {<Field>: {"order": "asc"|"desc"}}.
+func (s SortField) source() map[string]any {
+	spec := map[string]any{"order": "asc"}
+	if s.Desc {
+		spec["order"] = "desc"
+	}
+	return map[string]any{s.Field: spec}
+}
+
+// SearchRequest captures everything Gitea sends to the _search endpoint.
+// Aggregations and Highlight are raw ES JSON bodies — callers write them as
+// map[string]any since each has exactly one call site with a fixed shape.
+type SearchRequest struct {
+	// Query is sent as "query"; nil leaves the key out.
+	Query        Query
+	// Sort is sent as "sort"; an empty slice leaves the key out.
+	Sort         []SortField
+	// From is sent as "from" only when greater than zero.
+	From         int
+	// Size is always sent as "size".
+	Size         int
+	// TrackTotal is sent as the track_total_hits URL parameter.
+	TrackTotal   bool
+	// Aggregations is sent as "aggs" when non-empty.
+	Aggregations map[string]any
+	// Highlight is sent as "highlight" when non-empty.
+	Highlight    map[string]any
+}
+
+// SearchHit is a single result row.
+type SearchHit struct {
+	// ID is the hit's "_id".
+	ID        string
+	// Score is the hit's "_score".
+	Score     float64
+	// Source is the hit's "_source", kept as undecoded JSON.
+	Source    json.Value
+	// Highlight is the hit's "highlight" object.
+	Highlight map[string][]string
+}
+
+// AggBucket is a terms-aggregation bucket.
+type AggBucket struct {
+	// Key is the bucket's "key" as decoded from JSON.
+	Key      any
+	// DocCount is the bucket's "doc_count".
+	DocCount int64
+}
+
+// SearchResponse is Gitea's decoded view of the search reply.
+type SearchResponse struct {
+	// Total is the reply's hits.total.value.
+	Total        int64
+	// Hits holds the returned rows in reply order.
+	Hits         []SearchHit
+	// Aggregations maps each aggregation name to its buckets; it is nil when
+	// the reply carries no aggregations.
+	Aggregations map[string][]AggBucket
+}
@@ -0,0 +1,34 @@
+// Copyright 2023 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package elasticsearch
+
+import (
+	"context"
+	"fmt"
+
+	"gitea.dev/modules/log"
+)
+
+// VersionedIndexName returns the index name this Indexer operates on: the
+// configured name with the version suffix applied.
+func (i *Indexer) VersionedIndexName() string {
+	name, version := i.indexName, i.version
+	return versionedIndexName(name, version)
+}
+
+// versionedIndexName appends ".v<version>" to indexName. Version 0 denotes
+// the old naming scheme without a version, so the name is returned as-is.
+func versionedIndexName(indexName string, version int) string {
+	if version != 0 {
+		return fmt.Sprintf("%s.v%d", indexName, version)
+	}
+	return indexName
+}
+
+// checkOldIndexes looks for indexes of every version below the current one
+// (including the unversioned name for version 0) and logs a warning for each
+// that exists. Nothing is deleted, and lookup errors are ignored.
+func (i *Indexer) checkOldIndexes(ctx context.Context) {
+	for old := 0; old < i.version; old++ {
+		name := versionedIndexName(i.indexName, old)
+		if found, err := i.indexExists(ctx, name); err != nil || !found {
+			continue
+		}
+		log.Warn("Found older elasticsearch index named %q, Gitea will keep the old NOT DELETED. You can delete the old version after the upgrade succeed.", name)
+	}
+}