初始提交: Gitea 项目代码

This commit is contained in:
root
2026-05-30 22:47:36 +08:00
commit f288f76350
6116 changed files with 776822 additions and 0 deletions
+392
View File
@@ -0,0 +1,392 @@
// Copyright 2019 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package bleve
import (
"context"
"fmt"
"io"
"strconv"
"strings"
"time"
repo_model "gitea.dev/models/repo"
"gitea.dev/modules/analyze"
"gitea.dev/modules/charset"
"gitea.dev/modules/git"
"gitea.dev/modules/git/gitcmd"
"gitea.dev/modules/gitrepo"
"gitea.dev/modules/indexer"
path_filter "gitea.dev/modules/indexer/code/bleve/token/path"
"gitea.dev/modules/indexer/code/internal"
indexer_internal "gitea.dev/modules/indexer/internal"
inner_bleve "gitea.dev/modules/indexer/internal/bleve"
"gitea.dev/modules/setting"
"gitea.dev/modules/timeutil"
"gitea.dev/modules/typesniffer"
"gitea.dev/modules/util"
"github.com/blevesearch/bleve/v2"
analyzer_custom "github.com/blevesearch/bleve/v2/analysis/analyzer/custom"
analyzer_keyword "github.com/blevesearch/bleve/v2/analysis/analyzer/keyword"
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
"github.com/blevesearch/bleve/v2/analysis/token/unicodenorm"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/letter"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
"github.com/blevesearch/bleve/v2/mapping"
"github.com/blevesearch/bleve/v2/search/query"
"github.com/go-enry/go-enry/v2"
)
const (
unicodeNormalizeName = "unicodeNormalize"
maxBatchSize = 16
)
func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
return m.AddCustomTokenFilter(unicodeNormalizeName, map[string]any{
"type": unicodenorm.Name,
"form": unicodenorm.NFC,
})
}
// RepoIndexerData data stored in the repo indexer
type RepoIndexerData struct {
RepoID int64
CommitID string
Content string
Filename string
Language string
UpdatedAt time.Time
}
// Type returns the document type, for bleve's mapping.Classifier interface.
func (d *RepoIndexerData) Type() string {
return repoIndexerDocType
}
const (
repoIndexerAnalyzer = "repoIndexerAnalyzer"
filenameIndexerAnalyzer = "filenameIndexerAnalyzer"
filenameIndexerTokenizer = "filenameIndexerTokenizer"
repoIndexerDocType = "repoIndexerDocType"
repoIndexerLatestVersion = 9
)
// generateBleveIndexMapping generates a bleve index mapping for the repo indexer
func generateBleveIndexMapping() (mapping.IndexMapping, error) {
docMapping := bleve.NewDocumentMapping()
numericFieldMapping := bleve.NewNumericFieldMapping()
numericFieldMapping.IncludeInAll = false
docMapping.AddFieldMappingsAt("RepoID", numericFieldMapping)
textFieldMapping := bleve.NewTextFieldMapping()
textFieldMapping.IncludeInAll = false
docMapping.AddFieldMappingsAt("Content", textFieldMapping)
fileNamedMapping := bleve.NewTextFieldMapping()
fileNamedMapping.IncludeInAll = false
fileNamedMapping.Analyzer = filenameIndexerAnalyzer
docMapping.AddFieldMappingsAt("Filename", fileNamedMapping)
termFieldMapping := bleve.NewTextFieldMapping()
termFieldMapping.IncludeInAll = false
termFieldMapping.Analyzer = analyzer_keyword.Name
docMapping.AddFieldMappingsAt("Language", termFieldMapping)
docMapping.AddFieldMappingsAt("CommitID", termFieldMapping)
timeFieldMapping := bleve.NewDateTimeFieldMapping()
timeFieldMapping.IncludeInAll = false
docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping)
mapping := bleve.NewIndexMapping()
if err := addUnicodeNormalizeTokenFilter(mapping); err != nil {
return nil, err
} else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{
"type": analyzer_custom.Name,
"char_filters": []string{},
"tokenizer": letter.Name,
"token_filters": []string{unicodeNormalizeName, lowercase.Name},
}); err != nil {
return nil, err
}
if err := mapping.AddCustomAnalyzer(filenameIndexerAnalyzer, map[string]any{
"type": analyzer_custom.Name,
"char_filters": []string{},
"tokenizer": unicode.Name,
"token_filters": []string{unicodeNormalizeName, path_filter.Name, lowercase.Name},
}); err != nil {
return nil, err
}
mapping.DefaultAnalyzer = repoIndexerAnalyzer
mapping.AddDocumentMapping(repoIndexerDocType, docMapping)
mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping())
return mapping, nil
}
var _ internal.Indexer = &Indexer{}
// Indexer represents a bleve indexer implementation
type Indexer struct {
inner *inner_bleve.Indexer
indexer_internal.Indexer // do not composite inner_bleve.Indexer directly to avoid exposing too much
}
func (b *Indexer) SupportedSearchModes() []indexer.SearchMode {
return indexer.SearchModesExactWords()
}
// NewIndexer creates a new bleve local indexer
func NewIndexer(indexDir string) *Indexer {
inner := inner_bleve.NewIndexer(indexDir, repoIndexerLatestVersion, generateBleveIndexMapping)
return &Indexer{
Indexer: inner,
inner: inner,
}
}
func (b *Indexer) addUpdate(ctx context.Context, catFileBatch git.CatFileBatch, commitSha string,
update internal.FileUpdate, repo *repo_model.Repository, batch *inner_bleve.FlushingBatch,
) error {
// Ignore vendored files in code search
if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) {
return nil
}
size := update.Size
var err error
if !update.Sized {
var stdout string
stdout, _, err = gitrepo.RunCmdString(ctx, repo, gitcmd.NewCommand("cat-file", "-s").AddDynamicArguments(update.BlobSha))
if err != nil {
return err
}
if size, err = strconv.ParseInt(strings.TrimSpace(stdout), 10, 64); err != nil {
return fmt.Errorf("misformatted git cat-file output: %w", err)
}
}
if size > setting.Indexer.MaxIndexerFileSize {
return b.addDelete(update.Filename, repo, batch)
}
info, batchReader, err := catFileBatch.QueryContent(update.BlobSha)
if err != nil {
return err
}
fileContents, err := io.ReadAll(io.LimitReader(batchReader, info.Size))
if err != nil {
return err
} else if !typesniffer.DetectContentType(fileContents).IsText() {
// FIXME: UTF-16 files will probably fail here
// Even if the file is not recognized as a "text file", we could still put its name into the indexers to make the filename become searchable, while leave the content to empty.
fileContents = nil
}
if _, err = batchReader.Discard(1); err != nil {
return err
}
id := internal.FilenameIndexerID(repo.ID, update.Filename)
return batch.Index(id, &RepoIndexerData{
RepoID: repo.ID,
CommitID: commitSha,
Filename: update.Filename,
Content: string(charset.ToUTF8DropErrors(fileContents)),
Language: analyze.GetCodeLanguage(update.Filename, fileContents),
UpdatedAt: time.Now().UTC(),
})
}
func (b *Indexer) addDelete(filename string, repo *repo_model.Repository, batch *inner_bleve.FlushingBatch) error {
id := internal.FilenameIndexerID(repo.ID, filename)
return batch.Delete(id)
}
// Index indexes the data
func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *internal.RepoChanges) error {
batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize)
if len(changes.Updates) > 0 {
catfileBatch, err := gitrepo.NewBatch(ctx, repo)
if err != nil {
return err
}
defer catfileBatch.Close()
for _, update := range changes.Updates {
if err := b.addUpdate(ctx, catfileBatch, sha, update, repo, batch); err != nil {
return err
}
}
}
for _, filename := range changes.RemovedFilenames {
if err := b.addDelete(filename, repo, batch); err != nil {
return err
}
}
return batch.Flush()
}
// Delete deletes indexes by ids
func (b *Indexer) Delete(_ context.Context, repoID int64) error {
query := inner_bleve.NumericEqualityQuery(repoID, "RepoID")
searchRequest := bleve.NewSearchRequestOptions(query, 2147483647, 0, false)
result, err := b.inner.Indexer.Search(searchRequest)
if err != nil {
return err
}
batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize)
for _, hit := range result.Hits {
if err = batch.Delete(hit.ID); err != nil {
return err
}
}
return batch.Flush()
}
// Search searches for files in the specified repo.
// Returns the matching file-paths
func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
var (
indexerQuery query.Query
keywordQuery query.Query
contentQuery query.Query
)
pathQuery := bleve.NewPrefixQuery(strings.ToLower(opts.Keyword))
pathQuery.FieldVal = "Filename"
pathQuery.SetBoost(10)
searchMode := util.IfZero(opts.SearchMode, b.SupportedSearchModes()[0].ModeValue)
if searchMode == indexer.SearchModeExact {
// 1.21 used NewPrefixQuery, but it seems not working well, and later releases changed to NewMatchPhraseQuery
q := bleve.NewMatchPhraseQuery(opts.Keyword)
q.Analyzer = repoIndexerAnalyzer
q.FieldVal = "Content"
contentQuery = q
} else /* words */ {
q := bleve.NewMatchQuery(opts.Keyword)
q.FieldVal = "Content"
q.Analyzer = repoIndexerAnalyzer
if searchMode == indexer.SearchModeFuzzy {
// this logic doesn't seem right, it is only used to pass the test-case `Keyword: "dESCRIPTION"`, which doesn't seem to be a real-life use-case.
q.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword)
} else {
q.Operator = query.MatchQueryOperatorAnd
}
contentQuery = q
}
keywordQuery = bleve.NewDisjunctionQuery(contentQuery, pathQuery)
if len(opts.RepoIDs) > 0 {
repoQueries := make([]query.Query, 0, len(opts.RepoIDs))
for _, repoID := range opts.RepoIDs {
repoQueries = append(repoQueries, inner_bleve.NumericEqualityQuery(repoID, "RepoID"))
}
indexerQuery = bleve.NewConjunctionQuery(
bleve.NewDisjunctionQuery(repoQueries...),
keywordQuery,
)
} else {
indexerQuery = keywordQuery
}
// Save for reuse without language filter
facetQuery := indexerQuery
if len(opts.Language) > 0 {
languageQuery := bleve.NewMatchQuery(opts.Language)
languageQuery.FieldVal = "Language"
languageQuery.Analyzer = analyzer_keyword.Name
indexerQuery = bleve.NewConjunctionQuery(
indexerQuery,
languageQuery,
)
}
from, pageSize := opts.GetSkipTake()
searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false)
searchRequest.Fields = []string{"Content", "Filename", "RepoID", "Language", "CommitID", "UpdatedAt"}
searchRequest.IncludeLocations = true
if len(opts.Language) == 0 {
searchRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
}
searchRequest.SortBy([]string{"-_score", "UpdatedAt"})
result, err := b.inner.Indexer.SearchInContext(ctx, searchRequest)
if err != nil {
return 0, nil, nil, err
}
total := int64(result.Total)
searchResults := make([]*internal.SearchResult, len(result.Hits))
for i, hit := range result.Hits {
startIndex, endIndex := -1, -1
for _, locations := range hit.Locations["Content"] {
location := locations[0]
locationStart := int(location.Start)
locationEnd := int(location.End)
if startIndex < 0 || locationStart < startIndex {
startIndex = locationStart
}
if endIndex < 0 || locationEnd > endIndex {
endIndex = locationEnd
}
}
if len(hit.Locations["Filename"]) > 0 {
startIndex, endIndex = internal.FilenameMatchIndexPos(hit.Fields["Content"].(string))
}
language := hit.Fields["Language"].(string)
var updatedUnix timeutil.TimeStamp
if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil {
updatedUnix = timeutil.TimeStamp(t.Unix())
}
searchResults[i] = &internal.SearchResult{
RepoID: int64(hit.Fields["RepoID"].(float64)),
StartIndex: startIndex,
EndIndex: endIndex,
Filename: internal.FilenameOfIndexerID(hit.ID),
Content: hit.Fields["Content"].(string),
CommitID: hit.Fields["CommitID"].(string),
UpdatedUnix: updatedUnix,
Language: language,
Color: enry.GetColor(language),
}
}
searchResultLanguages := make([]*internal.SearchResultLanguages, 0, 10)
if len(opts.Language) > 0 {
// Use separate query to go get all language counts
facetRequest := bleve.NewSearchRequestOptions(facetQuery, 1, 0, false)
facetRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
facetRequest.IncludeLocations = true
facetRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
if result, err = b.inner.Indexer.Search(facetRequest); err != nil {
return 0, nil, nil, err
}
}
languagesFacet := result.Facets["languages"]
for _, term := range languagesFacet.Terms.Terms() {
if len(term.Term) == 0 {
continue
}
searchResultLanguages = append(searchResultLanguages, &internal.SearchResultLanguages{
Language: term.Term,
Color: enry.GetColor(term.Term),
Count: term.Count,
})
}
return total, searchResults, searchResultLanguages, nil
}
@@ -0,0 +1,105 @@
// Copyright 2024 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package path
import (
"slices"
"strings"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const (
Name = "gitea/path"
)
type TokenFilter struct{}
func NewTokenFilter() *TokenFilter {
return &TokenFilter{}
}
func TokenFilterConstructor(config map[string]any, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewTokenFilter(), nil
}
func (s *TokenFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
if len(input) == 1 {
// if there is only one token, we don't need to generate the reversed chain
return generatePathTokens(input, false)
}
normal := generatePathTokens(input, false)
reversed := generatePathTokens(input, true)
return append(normal, reversed...)
}
// Generates path tokens from the input tokens.
// This mimics the behavior of the path hierarchy tokenizer in ES. It takes the input tokens and combine them, generating a term for each component
// in tree (e.g., foo/bar/baz.md will generate foo, foo/bar, and foo/bar/baz.md).
//
// If the reverse flag is set, the order of the tokens is reversed (the same input will generate baz.md, baz.md/bar, baz.md/bar/foo). This is useful
// to efficiently search for filenames without supplying the fullpath.
func generatePathTokens(input analysis.TokenStream, reversed bool) analysis.TokenStream {
terms := make([]string, 0, len(input))
longestTerm := 0
if reversed {
slices.Reverse(input)
}
for i := range input {
var sb strings.Builder
sb.Write(input[0].Term)
for j := 1; j < i; j++ {
sb.WriteString("/")
sb.Write(input[j].Term)
}
term := sb.String()
if longestTerm < len(term) {
longestTerm = len(term)
}
terms = append(terms, term)
}
output := make(analysis.TokenStream, 0, len(terms))
for _, term := range terms {
var start, end int
if reversed {
start = 0
end = len(term)
} else {
start = longestTerm - len(term)
end = longestTerm
}
token := analysis.Token{
Position: 1,
Start: start,
End: end,
Type: analysis.AlphaNumeric,
Term: []byte(term),
}
output = append(output, &token)
}
return output
}
func init() {
// FIXME: move it to the bleve's init function, but do not call it in global init
err := registry.RegisterTokenFilter(Name, TokenFilterConstructor)
if err != nil {
panic(err)
}
}
@@ -0,0 +1,76 @@
// Copyright 2024 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package path
import (
"fmt"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
"github.com/stretchr/testify/assert"
)
type Scenario struct {
Input string
Tokens []string
}
func TestTokenFilter(t *testing.T) {
scenarios := []struct {
Input string
Terms []string
}{
{
Input: "Dockerfile",
Terms: []string{"Dockerfile"},
},
{
Input: "Dockerfile.rootless",
Terms: []string{"Dockerfile.rootless"},
},
{
Input: "a/b/c/Dockerfile.rootless",
Terms: []string{"a", "a/b", "a/b/c", "a/b/c/Dockerfile.rootless", "Dockerfile.rootless", "Dockerfile.rootless/c", "Dockerfile.rootless/c/b", "Dockerfile.rootless/c/b/a"},
},
{
Input: "",
Terms: []string{},
},
}
for _, scenario := range scenarios {
t.Run(fmt.Sprintf("ensure terms of '%s'", scenario.Input), func(t *testing.T) {
terms := extractTerms(scenario.Input)
assert.Len(t, terms, len(scenario.Terms))
for _, term := range terms {
assert.Contains(t, scenario.Terms, term)
}
})
}
}
func extractTerms(input string) []string {
tokens := tokenize(input)
filteredTokens := filter(tokens)
terms := make([]string, 0, len(filteredTokens))
for _, token := range filteredTokens {
terms = append(terms, string(token.Term))
}
return terms
}
func filter(input analysis.TokenStream) analysis.TokenStream {
filter := NewTokenFilter()
return filter.Filter(input)
}
func tokenize(input string) analysis.TokenStream {
tokenizer := unicode.NewUnicodeTokenizer()
return tokenizer.Tokenize([]byte(input))
}
@@ -0,0 +1,405 @@
// Copyright 2020 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package elasticsearch
import (
"context"
"fmt"
"io"
"strconv"
"strings"
repo_model "gitea.dev/models/repo"
"gitea.dev/modules/analyze"
"gitea.dev/modules/charset"
"gitea.dev/modules/git"
"gitea.dev/modules/git/gitcmd"
"gitea.dev/modules/gitrepo"
"gitea.dev/modules/indexer"
"gitea.dev/modules/indexer/code/internal"
es "gitea.dev/modules/indexer/internal/elasticsearch"
"gitea.dev/modules/json"
"gitea.dev/modules/log"
"gitea.dev/modules/setting"
"gitea.dev/modules/timeutil"
"gitea.dev/modules/typesniffer"
"gitea.dev/modules/util"
"github.com/go-enry/go-enry/v2"
)
const esRepoIndexerLatestVersion = 3
var _ internal.Indexer = &Indexer{}
// Indexer implements Indexer interface
type Indexer struct {
*es.Indexer
}
func (b *Indexer) SupportedSearchModes() []indexer.SearchMode {
return indexer.SearchModesExactWords()
}
// NewIndexer creates a new elasticsearch indexer
func NewIndexer(url, indexerName string) *Indexer {
return &Indexer{Indexer: es.NewIndexer(url, indexerName, esRepoIndexerLatestVersion, defaultMapping)}
}
const (
defaultMapping = `{
"settings": {
"analysis": {
"analyzer": {
"content_analyzer": {
"tokenizer": "content_tokenizer",
"filter" : ["lowercase"]
},
"filename_path_analyzer": {
"tokenizer": "path_tokenizer"
},
"reversed_filename_path_analyzer": {
"tokenizer": "reversed_path_tokenizer"
}
},
"tokenizer": {
"content_tokenizer": {
"type": "simple_pattern_split",
"pattern": "[^a-zA-Z0-9]"
},
"path_tokenizer": {
"type": "path_hierarchy",
"delimiter": "/"
},
"reversed_path_tokenizer": {
"type": "path_hierarchy",
"delimiter": "/",
"reverse": true
}
}
}
},
"mappings": {
"properties": {
"repo_id": {
"type": "long",
"index": true
},
"filename": {
"type": "text",
"term_vector": "with_positions_offsets",
"index": true,
"fields": {
"path": {
"type": "text",
"analyzer": "reversed_filename_path_analyzer"
},
"path_reversed": {
"type": "text",
"analyzer": "filename_path_analyzer"
}
}
},
"content": {
"type": "text",
"term_vector": "with_positions_offsets",
"index": true,
"analyzer": "content_analyzer"
},
"commit_id": {
"type": "keyword",
"index": true
},
"language": {
"type": "keyword",
"index": true
},
"updated_at": {
"type": "long",
"index": true
}
}
}
}`
)
func (b *Indexer) addUpdate(ctx context.Context, catFileBatch git.CatFileBatch, sha string, update internal.FileUpdate, repo *repo_model.Repository) ([]es.BulkOp, error) {
// Ignore vendored files in code search
if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) {
return nil, nil
}
size := update.Size
var err error
if !update.Sized {
var stdout string
stdout, _, err = gitrepo.RunCmdString(ctx, repo, gitcmd.NewCommand("cat-file", "-s").AddDynamicArguments(update.BlobSha))
if err != nil {
return nil, err
}
if size, err = strconv.ParseInt(strings.TrimSpace(stdout), 10, 64); err != nil {
return nil, fmt.Errorf("misformatted git cat-file output: %w", err)
}
}
id := internal.FilenameIndexerID(repo.ID, update.Filename)
if size > setting.Indexer.MaxIndexerFileSize {
return []es.BulkOp{es.DeleteOp(id)}, nil
}
info, batchReader, err := catFileBatch.QueryContent(update.BlobSha)
if err != nil {
return nil, err
}
fileContents, err := io.ReadAll(io.LimitReader(batchReader, info.Size))
if err != nil {
return nil, err
} else if !typesniffer.DetectContentType(fileContents).IsText() {
// FIXME: UTF-16 files will probably fail here
return nil, nil
}
if _, err = batchReader.Discard(1); err != nil {
return nil, err
}
return []es.BulkOp{es.IndexOp(id, map[string]any{
"repo_id": repo.ID,
"filename": update.Filename,
"content": string(charset.ToUTF8DropErrors(fileContents)),
"commit_id": sha,
"language": analyze.GetCodeLanguage(update.Filename, fileContents),
"updated_at": timeutil.TimeStampNow(),
})}, nil
}
func (b *Indexer) addDelete(filename string, repo *repo_model.Repository) es.BulkOp {
return es.DeleteOp(internal.FilenameIndexerID(repo.ID, filename))
}
// Index will save the index data
func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *internal.RepoChanges) error {
ops := make([]es.BulkOp, 0)
if len(changes.Updates) > 0 {
batch, err := gitrepo.NewBatch(ctx, repo)
if err != nil {
return err
}
defer batch.Close()
for _, update := range changes.Updates {
updateOps, err := b.addUpdate(ctx, batch, sha, update, repo)
if err != nil {
return err
}
if len(updateOps) > 0 {
ops = append(ops, updateOps...)
}
}
}
for _, filename := range changes.RemovedFilenames {
ops = append(ops, b.addDelete(filename, repo))
}
if len(ops) > 0 {
esBatchSize := 50
for i := 0; i < len(ops); i += esBatchSize {
if err := b.Bulk(ctx, ops[i:min(i+esBatchSize, len(ops))]); err != nil {
return err
}
}
}
return nil
}
// Delete entries by repoId
func (b *Indexer) Delete(ctx context.Context, repoID int64) error {
if err := b.doDelete(ctx, repoID); err != nil {
// Maybe there is a conflict during the delete operation, so we should retry after a refresh
log.Warn("Deletion of entries of repo %v within index %v was erroneous: %v. Trying to refresh index before trying again", repoID, b.VersionedIndexName(), err)
if err := b.Refresh(ctx); err != nil {
return err
}
if err := b.doDelete(ctx, repoID); err != nil {
log.Error("Could not delete entries of repo %v within index %v", repoID, b.VersionedIndexName())
return err
}
}
return nil
}
// Delete entries by repoId
func (b *Indexer) doDelete(ctx context.Context, repoID int64) error {
return b.DeleteByQuery(ctx, es.TermsQuery("repo_id", repoID))
}
// contentMatchIndexPos find words positions for start and the following end on content. It will
// return the beginning position of the first start and the ending position of the
// first end following the start string.
// If not found any of the positions, it will return -1, -1.
func contentMatchIndexPos(content, start, end string) (int, int) {
startIdx := strings.Index(content, start)
if startIdx < 0 {
return -1, -1
}
endIdx := strings.Index(content[startIdx+len(start):], end)
if endIdx < 0 {
return -1, -1
}
return startIdx, (startIdx + len(start) + endIdx + len(end)) - 9 // remove the length <em></em> since we give Content the original data
}
func convertResult(searchResult *es.SearchResponse, kw string, pageSize int) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
hits := make([]*internal.SearchResult, 0, pageSize)
for _, hit := range searchResult.Hits {
repoID, fileName := internal.ParseIndexerID(hit.ID)
res := make(map[string]any)
if err := json.Unmarshal(hit.Source, &res); err != nil {
return 0, nil, nil, err
}
// FIXME: There is no way to get the position the keyword on the content currently on the same request.
// So we get it from content, this may made the query slower. See
// https://discuss.elastic.co/t/fetching-position-of-keyword-in-matched-document/94291
var startIndex, endIndex int
if c, ok := hit.Highlight["filename"]; ok && len(c) > 0 {
startIndex, endIndex = internal.FilenameMatchIndexPos(res["content"].(string))
} else if c, ok := hit.Highlight["content"]; ok && len(c) > 0 {
// FIXME: Since the highlighting content will include <em> and </em> for the keywords,
// now we should find the positions. But how to avoid html content which contains the
// <em> and </em> tags? If elastic search has handled that?
startIndex, endIndex = contentMatchIndexPos(c[0], "<em>", "</em>")
if startIndex == -1 {
panic(fmt.Sprintf("1===%s,,,%#v,,,%s", kw, hit.Highlight, c[0]))
}
} else {
panic(fmt.Sprintf("2===%#v", hit.Highlight))
}
language := res["language"].(string)
hits = append(hits, &internal.SearchResult{
RepoID: repoID,
Filename: fileName,
CommitID: res["commit_id"].(string),
Content: res["content"].(string),
UpdatedUnix: timeutil.TimeStamp(res["updated_at"].(float64)),
Language: language,
StartIndex: startIndex,
EndIndex: endIndex,
Color: enry.GetColor(language),
})
}
return searchResult.Total, hits, extractAggs(searchResult), nil
}
func extractAggs(searchResult *es.SearchResponse) []*internal.SearchResultLanguages {
buckets, found := searchResult.Aggregations["language"]
if !found {
return nil
}
searchResultLanguages := make([]*internal.SearchResultLanguages, 0, 10)
for _, bucket := range buckets {
// language is mapped as keyword so the key is always a string; if the
// mapping ever changes, skip rather than emit an empty-language bucket.
key, ok := bucket.Key.(string)
if !ok {
continue
}
searchResultLanguages = append(searchResultLanguages, &internal.SearchResultLanguages{
Language: key,
Color: enry.GetColor(key),
Count: int(bucket.DocCount),
})
}
return searchResultLanguages
}
// Search searches for codes and language stats by given conditions.
func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
searchMode := util.IfZero(opts.SearchMode, b.SupportedSearchModes()[0].ModeValue)
contentQuery := es.Query(es.NewMultiMatchQuery(opts.Keyword, "content").Type(es.MultiMatchTypeBestFields).Operator("and"))
if searchMode == indexer.SearchModeExact {
contentQuery = es.MatchPhraseQuery("content", opts.Keyword)
}
kwQuery := es.NewBoolQuery().Should(
contentQuery,
es.NewMultiMatchQuery(opts.Keyword, "filename^10").Type(es.MultiMatchTypePhrasePrefix),
)
query := es.NewBoolQuery().Must(kwQuery)
if len(opts.RepoIDs) > 0 {
query.Must(es.TermsQuery("repo_id", es.ToAnySlice(opts.RepoIDs)...))
}
start, pageSize := opts.GetSkipTake()
kw := "<em>" + opts.Keyword + "</em>"
languageAggs := map[string]any{
"language": map[string]any{
"terms": map[string]any{
"field": "language",
"size": 10,
"order": map[string]any{"_count": "desc"},
},
},
}
// number_of_fragments=0 returns the full highlighted content (no fragmentation).
highlight := map[string]any{
"fields": map[string]any{
"content": map[string]any{},
"filename": map[string]any{},
},
"number_of_fragments": 0,
"type": "fvh",
}
sort := []es.SortField{
{Field: "_score", Desc: true},
{Field: "updated_at", Desc: false},
}
if len(opts.Language) == 0 {
resp, err := b.Indexer.Search(ctx, es.SearchRequest{
Query: query,
Sort: sort,
From: start,
Size: pageSize,
TrackTotal: true,
Aggregations: languageAggs,
Highlight: highlight,
})
if err != nil {
return 0, nil, nil, err
}
return convertResult(resp, kw, pageSize)
}
countResp, err := b.Indexer.Search(ctx, es.SearchRequest{
Query: query,
Size: 0, // stats only
TrackTotal: true,
Aggregations: languageAggs,
})
if err != nil {
return 0, nil, nil, err
}
query.Must(es.MatchQuery("language", opts.Language))
resp, err := b.Indexer.Search(ctx, es.SearchRequest{
Query: query,
Sort: sort,
From: start,
Size: pageSize,
TrackTotal: true,
Highlight: highlight,
})
if err != nil {
return 0, nil, nil, err
}
total, hits, _, err := convertResult(resp, kw, pageSize)
return total, hits, extractAggs(countResp), err
}
@@ -0,0 +1,16 @@
// Copyright 2020 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package elasticsearch
import (
"testing"
"github.com/stretchr/testify/assert"
)
func TestIndexPos(t *testing.T) {
startIdx, endIdx := contentMatchIndexPos("test index start and end", "start", "end")
assert.Equal(t, 11, startIdx)
assert.Equal(t, 15, endIdx)
}
+201
View File
@@ -0,0 +1,201 @@
// Copyright 2019 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package code
import (
"context"
"strconv"
"strings"
repo_model "gitea.dev/models/repo"
"gitea.dev/modules/git"
"gitea.dev/modules/git/gitcmd"
"gitea.dev/modules/gitrepo"
"gitea.dev/modules/indexer/code/internal"
"gitea.dev/modules/log"
"gitea.dev/modules/setting"
)
func getDefaultBranchSha(ctx context.Context, repo *repo_model.Repository) (string, error) {
stdout, _, err := gitrepo.RunCmdString(ctx, repo, gitcmd.NewCommand("show-ref", "-s").AddDynamicArguments(git.BranchPrefix+repo.DefaultBranch))
if err != nil {
return "", err
}
return strings.TrimSpace(stdout), nil
}
// getRepoChanges returns changes to repo since last indexer update
func getRepoChanges(ctx context.Context, repo *repo_model.Repository, revision string) (*internal.RepoChanges, error) {
status, err := repo_model.GetIndexerStatus(ctx, repo, repo_model.RepoIndexerTypeCode)
if err != nil {
return nil, err
}
needGenesis := len(status.CommitSha) == 0
if !needGenesis {
hasAncestorCmd := gitcmd.NewCommand("merge-base").AddDynamicArguments(status.CommitSha, revision)
stdout, _, _ := gitrepo.RunCmdString(ctx, repo, hasAncestorCmd) // FIXME: error is not handled
needGenesis = len(stdout) == 0
}
if needGenesis {
return genesisChanges(ctx, repo, revision)
}
return nonGenesisChanges(ctx, repo, revision)
}
func isIndexable(entry *git.TreeEntry) bool {
if !entry.IsRegular() && !entry.IsExecutable() {
return false
}
name := strings.ToLower(entry.Name())
for _, g := range setting.Indexer.ExcludePatterns {
if g.Match(name) {
return false
}
}
for _, g := range setting.Indexer.IncludePatterns {
if g.Match(name) {
return true
}
}
return len(setting.Indexer.IncludePatterns) == 0
}
// parseGitLsTreeOutput parses the output of a `git ls-tree -r --full-name` command
func parseGitLsTreeOutput(stdout []byte) ([]internal.FileUpdate, error) {
entries, err := git.ParseTreeEntries(stdout)
if err != nil {
return nil, err
}
idxCount := 0
updates := make([]internal.FileUpdate, len(entries))
for _, entry := range entries {
if isIndexable(entry) {
updates[idxCount] = internal.FileUpdate{
Filename: entry.Name(),
BlobSha: entry.ID.String(),
Size: entry.Size(),
Sized: true,
}
idxCount++
}
}
return updates[:idxCount], nil
}
// genesisChanges get changes to add repo to the indexer for the first time
func genesisChanges(ctx context.Context, repo *repo_model.Repository, revision string) (*internal.RepoChanges, error) {
var changes internal.RepoChanges
stdout, _, runErr := gitrepo.RunCmdBytes(ctx, repo, gitcmd.NewCommand("ls-tree", "--full-tree", "-l", "-r").AddDynamicArguments(revision))
if runErr != nil {
return nil, runErr
}
var err error
changes.Updates, err = parseGitLsTreeOutput(stdout)
return &changes, err
}
// nonGenesisChanges get changes since the previous indexer update
func nonGenesisChanges(ctx context.Context, repo *repo_model.Repository, revision string) (*internal.RepoChanges, error) {
diffCmd := gitcmd.NewCommand("diff", "--name-status").AddDynamicArguments(repo.CodeIndexerStatus.CommitSha, revision)
stdout, _, runErr := gitrepo.RunCmdString(ctx, repo, diffCmd)
if runErr != nil {
// previous commit sha may have been removed by a force push, so
// try rebuilding from scratch
log.Warn("git diff: %v", runErr)
if err := (*globalIndexer.Load()).Delete(ctx, repo.ID); err != nil {
return nil, err
}
return genesisChanges(ctx, repo, revision)
}
var changes internal.RepoChanges
var err error
updatedFilenames := make([]string, 0, 10)
updateChanges := func() error {
cmd := gitcmd.NewCommand("ls-tree", "--full-tree", "-l").AddDynamicArguments(revision).
AddDashesAndList(updatedFilenames...)
lsTreeStdout, _, err := gitrepo.RunCmdBytes(ctx, repo, cmd)
if err != nil {
return err
}
updates, err1 := parseGitLsTreeOutput(lsTreeStdout)
if err1 != nil {
return err1
}
changes.Updates = append(changes.Updates, updates...)
return nil
}
lines := strings.SplitSeq(stdout, "\n")
for line := range lines {
line = strings.TrimSpace(line)
if len(line) == 0 {
continue
}
fields := strings.Split(line, "\t")
if len(fields) < 2 {
log.Warn("Unparseable output for diff --name-status: `%s`)", line)
continue
}
filename := fields[1]
if len(filename) == 0 {
continue
} else if filename[0] == '"' {
filename, err = strconv.Unquote(filename)
if err != nil {
return nil, err
}
}
switch status := fields[0][0]; status {
case 'M', 'A':
updatedFilenames = append(updatedFilenames, filename)
case 'D':
changes.RemovedFilenames = append(changes.RemovedFilenames, filename)
case 'R', 'C':
if len(fields) < 3 {
log.Warn("Unparseable output for diff --name-status: `%s`)", line)
continue
}
dest := fields[2]
if len(dest) == 0 {
log.Warn("Unparseable output for diff --name-status: `%s`)", line)
continue
}
if dest[0] == '"' {
dest, err = strconv.Unquote(dest)
if err != nil {
return nil, err
}
}
if status == 'R' {
changes.RemovedFilenames = append(changes.RemovedFilenames, filename)
}
updatedFilenames = append(updatedFilenames, dest)
default:
log.Warn("Unrecognized status: %c (line=%s)", status, line)
}
// According to https://learn.microsoft.com/en-us/troubleshoot/windows-client/shell-experience/command-line-string-limitation#more-information
// the command line length should less than 8191 characters, assume filepath is 256, then 8191/256 = 31, so we use 30
if len(updatedFilenames) >= 30 {
if err := updateChanges(); err != nil {
return nil, err
}
updatedFilenames = updatedFilenames[0:0]
}
}
if len(updatedFilenames) > 0 {
if err := updateChanges(); err != nil {
return nil, err
}
}
return &changes, err
}
+66
View File
@@ -0,0 +1,66 @@
// Copyright 2025 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package gitgrep
import (
"context"
"fmt"
"strings"
"gitea.dev/modules/git"
"gitea.dev/modules/indexer"
code_indexer "gitea.dev/modules/indexer/code"
"gitea.dev/modules/setting"
)
func indexSettingToGitGrepPathspecList() (list []string) {
for _, expr := range setting.Indexer.IncludePatterns {
list = append(list, ":(glob)"+expr.PatternString())
}
for _, expr := range setting.Indexer.ExcludePatterns {
list = append(list, ":(glob,exclude)"+expr.PatternString())
}
return list
}
func PerformSearch(ctx context.Context, page int, repoID int64, gitRepo *git.Repository, ref git.RefName, keyword string, searchMode indexer.SearchModeType) (searchResults []*code_indexer.Result, total int64, err error) {
grepMode := git.GrepModeWords
switch searchMode {
case indexer.SearchModeExact:
grepMode = git.GrepModeExact
case indexer.SearchModeRegexp:
grepMode = git.GrepModeRegexp
}
res, err := git.GrepSearch(ctx, gitRepo, keyword, git.GrepOptions{
ContextLineNumber: 1,
GrepMode: grepMode,
RefName: ref.String(),
PathspecList: indexSettingToGitGrepPathspecList(),
})
if err != nil {
// TODO: if no branch exists, it reports: exit status 128, fatal: this operation must be run in a work tree.
return nil, 0, fmt.Errorf("git.GrepSearch: %w", err)
}
commitID, err := gitRepo.GetRefCommitID(ref.String())
if err != nil {
return nil, 0, fmt.Errorf("gitRepo.GetRefCommitID: %w", err)
}
total = int64(len(res))
pageStart := min((page-1)*setting.UI.RepoSearchPagingNum, len(res))
pageEnd := min(page*setting.UI.RepoSearchPagingNum, len(res))
res = res[pageStart:pageEnd]
for _, r := range res {
searchResults = append(searchResults, &code_indexer.Result{
RepoID: repoID,
Filename: r.Filename,
CommitID: commitID,
// UpdatedUnix: not supported yet
// Language: not supported yet
// Color: not supported yet
Lines: code_indexer.HighlightSearchResultCode(r.Filename, "", r.LineNumbers, strings.Join(r.LineCodes, "\n")),
})
}
return searchResults, total, nil
}
@@ -0,0 +1,19 @@
// Copyright 2024 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package gitgrep
import (
"testing"
"gitea.dev/modules/setting"
"gitea.dev/modules/test"
"github.com/stretchr/testify/assert"
)
func TestIndexSettingToGitGrepPathspecList(t *testing.T) {
defer test.MockVariableValue(&setting.Indexer.IncludePatterns, setting.IndexerGlobFromString("a"))()
defer test.MockVariableValue(&setting.Indexer.ExcludePatterns, setting.IndexerGlobFromString("b"))()
assert.Equal(t, []string{":(glob)a", ":(glob,exclude)b"}, indexSettingToGitGrepPathspecList())
}
+314
View File
@@ -0,0 +1,314 @@
// Copyright 2016 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package code
import (
"context"
"os"
"runtime/pprof"
"slices"
"sync/atomic"
"time"
"gitea.dev/models/db"
repo_model "gitea.dev/models/repo"
"gitea.dev/modules/graceful"
"gitea.dev/modules/indexer"
"gitea.dev/modules/indexer/code/bleve"
"gitea.dev/modules/indexer/code/elasticsearch"
"gitea.dev/modules/indexer/code/internal"
"gitea.dev/modules/log"
"gitea.dev/modules/process"
"gitea.dev/modules/queue"
"gitea.dev/modules/setting"
"gitea.dev/modules/util"
)
var (
indexerQueue *queue.WorkerPoolQueue[*internal.IndexerData]
// globalIndexer is the global indexer, it cannot be nil.
// When the real indexer is not ready, it will be a dummy indexer which will return error to explain it's not ready.
// So it's always safe use it as *globalIndexer.Load() and call its methods.
globalIndexer atomic.Pointer[internal.Indexer]
)
func init() {
dummyIndexer := internal.NewDummyIndexer()
globalIndexer.Store(&dummyIndexer)
}
func index(ctx context.Context, indexer internal.Indexer, repoID int64) error {
repo, err := repo_model.GetRepositoryByID(ctx, repoID)
if repo_model.IsErrRepoNotExist(err) {
return indexer.Delete(ctx, repoID)
}
if err != nil {
return err
}
repoTypes := setting.Indexer.RepoIndexerRepoTypes
if len(repoTypes) == 0 {
repoTypes = []string{"sources"}
}
// skip forks from being indexed if unit is not present
if !slices.Contains(repoTypes, "forks") && repo.IsFork {
return nil
}
// skip mirrors from being indexed if unit is not present
if !slices.Contains(repoTypes, "mirrors") && repo.IsMirror {
return nil
}
// skip templates from being indexed if unit is not present
if !slices.Contains(repoTypes, "templates") && repo.IsTemplate {
return nil
}
// skip regular repos from being indexed if unit is not present
if !slices.Contains(repoTypes, "sources") && !repo.IsFork && !repo.IsMirror && !repo.IsTemplate {
return nil
}
sha, err := getDefaultBranchSha(ctx, repo)
if err != nil {
return err
}
changes, err := getRepoChanges(ctx, repo, sha)
if err != nil {
return err
} else if changes == nil {
return nil
}
if err := indexer.Index(ctx, repo, sha, changes); err != nil {
return err
}
return repo_model.UpdateIndexerStatus(ctx, repo, repo_model.RepoIndexerTypeCode, sha)
}
// Init initialize the repo indexer
func Init() {
if !setting.Indexer.RepoIndexerEnabled {
(*globalIndexer.Load()).Close()
return
}
ctx, cancel, finished := process.GetManager().AddTypedContext(context.Background(), "Service: CodeIndexer", process.SystemProcessType, false)
graceful.GetManager().RunAtTerminate(func() {
select {
case <-ctx.Done():
return
default:
}
cancel()
log.Debug("Closing repository indexer")
(*globalIndexer.Load()).Close()
log.Info("PID: %d Repository Indexer closed", os.Getpid())
finished()
})
waitChannel := make(chan time.Duration, 1)
// Create the Queue
switch setting.Indexer.RepoType {
case "bleve", "elasticsearch":
handler := func(items ...*internal.IndexerData) (unhandled []*internal.IndexerData) {
indexer := *globalIndexer.Load()
for _, indexerData := range items {
log.Trace("IndexerData Process Repo: %d", indexerData.RepoID)
if err := index(ctx, indexer, indexerData.RepoID); err != nil {
if !setting.IsInTesting {
log.Error("Codes indexer handler: index error for repo %v: %v", indexerData.RepoID, err)
}
}
}
return nil // do not re-queue the failed items, otherwise some broken repo will block the queue
}
indexerQueue = queue.CreateUniqueQueue(ctx, "code_indexer", handler)
if indexerQueue == nil {
log.Fatal("Unable to create codes indexer queue")
}
default:
log.Fatal("Unknown codes indexer type; %s", setting.Indexer.RepoType)
}
go func() {
pprof.SetGoroutineLabels(ctx)
start := time.Now()
var (
rIndexer internal.Indexer
existed bool
err error
)
switch setting.Indexer.RepoType {
case "bleve":
log.Info("PID: %d Initializing Repository Indexer at: %s", os.Getpid(), setting.Indexer.RepoPath)
defer func() {
if err := recover(); err != nil {
log.Error("PANIC whilst initializing repository indexer: %v\nStacktrace: %s", err, log.Stack(2))
log.Error("The indexer files are likely corrupted and may need to be deleted")
log.Error("You can completely remove the \"%s\" directory to make Gitea recreate the indexes", setting.Indexer.RepoPath)
}
}()
rIndexer = bleve.NewIndexer(setting.Indexer.RepoPath)
existed, err = rIndexer.Init(ctx)
if err != nil {
cancel()
(*globalIndexer.Load()).Close()
close(waitChannel)
log.Fatal("PID: %d Unable to initialize the bleve Repository Indexer at path: %s Error: %v", os.Getpid(), setting.Indexer.RepoPath, err)
}
case "elasticsearch":
log.Info("PID: %d Initializing Repository Indexer at: %s", os.Getpid(), util.SanitizeCredentialURLs(setting.Indexer.RepoConnStr))
defer func() {
if err := recover(); err != nil {
log.Error("PANIC whilst initializing repository indexer: %v\nStacktrace: %s", err, log.Stack(2))
log.Error("The indexer files are likely corrupted and may need to be deleted")
log.Error("You can completely remove the \"%s\" index to make Gitea recreate the indexes", util.SanitizeCredentialURLs(setting.Indexer.RepoConnStr))
}
}()
rIndexer = elasticsearch.NewIndexer(setting.Indexer.RepoConnStr, setting.Indexer.RepoIndexerName)
existed, err = rIndexer.Init(ctx)
if err != nil {
cancel()
(*globalIndexer.Load()).Close()
close(waitChannel)
log.Fatal("PID: %d Unable to initialize the elasticsearch Repository Indexer connstr: %s Error: %v", os.Getpid(), util.SanitizeCredentialURLs(setting.Indexer.RepoConnStr), err)
}
default:
log.Fatal("PID: %d Unknown Indexer type: %s", os.Getpid(), setting.Indexer.RepoType)
}
globalIndexer.Store(&rIndexer)
// Start processing the queue
go graceful.GetManager().RunWithCancel(indexerQueue)
if !existed { // populate the index because it's created for the first time
go graceful.GetManager().RunWithShutdownContext(populateRepoIndexer)
}
select {
case waitChannel <- time.Since(start):
case <-graceful.GetManager().IsShutdown():
}
close(waitChannel)
}()
if setting.Indexer.StartupTimeout > 0 {
go func() {
pprof.SetGoroutineLabels(ctx)
timeout := setting.Indexer.StartupTimeout
if graceful.GetManager().IsChild() && setting.GracefulHammerTime > 0 {
timeout += setting.GracefulHammerTime
}
select {
case <-graceful.GetManager().IsShutdown():
log.Warn("Shutdown before Repository Indexer completed initialization")
cancel()
(*globalIndexer.Load()).Close()
case duration, ok := <-waitChannel:
if !ok {
log.Warn("Repository Indexer Initialization failed")
cancel()
(*globalIndexer.Load()).Close()
return
}
log.Info("Repository Indexer Initialization took %v", duration)
case <-time.After(timeout):
cancel()
(*globalIndexer.Load()).Close()
log.Fatal("Repository Indexer Initialization Timed-Out after: %v", timeout)
}
}()
}
}
// UpdateRepoIndexer update a repository's entries in the indexer
func UpdateRepoIndexer(repo *repo_model.Repository) {
indexData := &internal.IndexerData{RepoID: repo.ID}
if err := indexerQueue.Push(indexData); err != nil {
log.Error("Update repo index data %v failed: %v", indexData, err)
}
}
// IsAvailable checks if issue indexer is available
func IsAvailable(ctx context.Context) bool {
return (*globalIndexer.Load()).Ping(ctx) == nil
}
// populateRepoIndexer populate the repo indexer with pre-existing data. This
// should only be run when the indexer is created for the first time.
func populateRepoIndexer(ctx context.Context) {
log.Info("Populating the repo indexer with existing repositories")
exist, err := db.IsTableNotEmpty("repository")
if err != nil {
log.Fatal("System error: %v", err)
} else if !exist {
return
}
// if there is any existing repo indexer metadata in the DB, delete it
// since we are starting afresh. Also, xorm requires deletes to have a
// condition, and we want to delete everything, thus 1=1.
if err := db.DeleteAllRecords("repo_indexer_status"); err != nil {
log.Fatal("System error: %v", err)
}
var maxRepoID int64
if maxRepoID, err = db.GetMaxID("repository"); err != nil {
log.Fatal("System error: %v", err)
}
// start with the maximum existing repo ID and work backwards, so that we
// don't include repos that are created after gitea starts; such repos will
// already be added to the indexer, and we don't need to add them again.
for maxRepoID > 0 {
select {
case <-ctx.Done():
log.Info("Repository Indexer population shutdown before completion")
return
default:
}
ids, err := repo_model.GetUnindexedRepos(ctx, repo_model.RepoIndexerTypeCode, maxRepoID, 0, 50)
if err != nil {
log.Error("populateRepoIndexer: %v", err)
return
} else if len(ids) == 0 {
break
}
for _, id := range ids {
select {
case <-ctx.Done():
log.Info("Repository Indexer population shutdown before completion")
return
default:
}
if err := indexerQueue.Push(&internal.IndexerData{RepoID: id}); err != nil {
log.Error("indexerQueue.Push: %v", err)
return
}
maxRepoID = id - 1
}
}
log.Info("Done (re)populating the repo indexer with existing repositories")
}
func SupportedSearchModes() []indexer.SearchMode {
gi := globalIndexer.Load()
if gi == nil {
return nil
}
return (*gi).SupportedSearchModes()
}
+352
View File
@@ -0,0 +1,352 @@
// Copyright 2020 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package code
import (
"context"
"os"
"slices"
"testing"
"time"
"gitea.dev/models/db"
"gitea.dev/models/unittest"
indexer_module "gitea.dev/modules/indexer"
"gitea.dev/modules/indexer/code/bleve"
"gitea.dev/modules/indexer/code/elasticsearch"
"gitea.dev/modules/indexer/code/internal"
"gitea.dev/modules/setting"
"gitea.dev/modules/test"
"gitea.dev/modules/util"
_ "gitea.dev/models"
_ "gitea.dev/models/actions"
_ "gitea.dev/models/activities"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
type codeSearchResult struct {
Filename string
Content string
}
func TestMain(m *testing.M) {
unittest.MainTest(m)
}
func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
t.Run(name, func(t *testing.T) {
assert.NoError(t, setupRepositoryIndexes(t.Context(), indexer))
// Wait for the index to catch up: ES/OpenSearch make writes visible
// only after a refresh (default interval: 1s). Bleve is synchronous
// and passes on the first iteration.
require.Eventually(t, func() bool {
total, _, _, err := indexer.Search(t.Context(), &internal.SearchOptions{
Keyword: "Description",
Paginator: &db.ListOptions{Page: 1, PageSize: 1},
})
return err == nil && total > 0
}, 10*time.Second, 100*time.Millisecond, "index did not become searchable")
keywords := []struct {
RepoIDs []int64
Keyword string
Langs int
SearchMode indexer_module.SearchModeType
Results []codeSearchResult
}{
// Search for an exact match on the contents of a file
// This scenario yields a single result (the file README.md on the repo '1')
{
RepoIDs: nil,
Keyword: "Description",
Langs: 1,
Results: []codeSearchResult{
{
Filename: "README.md",
Content: "# repo1\n\nDescription for repo1",
},
},
},
// Search for an exact match on the contents of a file within the repo '2'.
// This scenario yields no results
{
RepoIDs: []int64{2},
Keyword: "Description",
Langs: 0,
},
// Search for an exact match on the contents of a file
// This scenario yields a single result (the file README.md on the repo '1')
{
RepoIDs: nil,
Keyword: "repo1",
Langs: 1,
Results: []codeSearchResult{
{
Filename: "README.md",
Content: "# repo1\n\nDescription for repo1",
},
},
},
// Search for an exact match on the contents of a file within the repo '2'.
// This scenario yields no results
{
RepoIDs: []int64{2},
Keyword: "repo1",
Langs: 0,
},
// Search for a non-existing term.
// This scenario yields no results
{
RepoIDs: nil,
Keyword: "non-exist",
Langs: 0,
},
// Search for an exact match on the contents of a file within the repo '62'.
// This scenario yields a single result (the file avocado.md on the repo '62')
{
RepoIDs: []int64{62},
Keyword: "pineaple",
Langs: 1,
Results: []codeSearchResult{
{
Filename: "avocado.md",
Content: "# repo1\n\npineaple pie of cucumber juice",
},
},
},
// Search for an exact match on the filename within the repo '62'.
// This scenario yields a single result (the file avocado.md on the repo '62')
{
RepoIDs: []int64{62},
Keyword: "avocado.md",
Langs: 1,
Results: []codeSearchResult{
{
Filename: "avocado.md",
Content: "# repo1\n\npineaple pie of cucumber juice",
},
},
},
// Search for an partial match on the filename within the repo '62'.
// This scenario yields a single result (the file avocado.md on the repo '62')
{
RepoIDs: []int64{62},
Keyword: "avo",
Langs: 1,
Results: []codeSearchResult{
{
Filename: "avocado.md",
Content: "# repo1\n\npineaple pie of cucumber juice",
},
},
},
// Search for matches on both the contents and the filenames within the repo '62'.
// This scenario yields two results: the first result is based on the file (cucumber.md) while the second is based on the contents
{
RepoIDs: []int64{62},
Keyword: "cucumber",
Langs: 1,
Results: []codeSearchResult{
{
Filename: "cucumber.md",
Content: "Salad is good for your health",
},
{
Filename: "avocado.md",
Content: "# repo1\n\npineaple pie of cucumber juice",
},
},
},
// Search for matches on the filenames within the repo '62'.
// This scenario yields two results (both are based on filename, the first one is an exact match)
{
RepoIDs: []int64{62},
Keyword: "ham",
Langs: 1,
Results: []codeSearchResult{
{
Filename: "ham.md",
Content: "This is also not cheese",
},
{
Filename: "potato/ham.md",
Content: "This is not cheese",
},
},
},
// Search for matches on the contents of files within the repo '62'.
// This scenario yields two results (both are based on contents, the first one is an exact match where as the second is a 'fuzzy' one)
{
RepoIDs: []int64{62},
Keyword: "This is not cheese",
Langs: 1,
Results: []codeSearchResult{
{
Filename: "potato/ham.md",
Content: "This is not cheese",
},
{
Filename: "ham.md",
Content: "This is also not cheese",
},
},
},
// Search for matches on the contents of files regardless of case.
{
RepoIDs: nil,
Keyword: "dESCRIPTION",
Langs: 1,
SearchMode: indexer_module.SearchModeFuzzy,
Results: []codeSearchResult{
{
Filename: "README.md",
Content: "# repo1\n\nDescription for repo1",
},
},
},
// Search for an exact match on the filename within the repo '62' (case-insensitive).
// This scenario yields a single result (the file avocado.md on the repo '62')
{
RepoIDs: []int64{62},
Keyword: "AVOCADO.MD",
Langs: 1,
Results: []codeSearchResult{
{
Filename: "avocado.md",
Content: "# repo1\n\npineaple pie of cucumber juice",
},
},
},
// Search for matches on the contents of files when the criteria are an expression.
{
RepoIDs: []int64{62},
Keyword: "console.log",
Langs: 1,
Results: []codeSearchResult{
{
Filename: "example-file.js",
Content: "console.log(\"Hello, World!\")",
},
},
},
// Search for matches on the contents of files when the criteria are parts of an expression.
{
RepoIDs: []int64{62},
Keyword: "log",
Langs: 1,
Results: []codeSearchResult{
{
Filename: "example-file.js",
Content: "console.log(\"Hello, World!\")",
},
},
},
}
for _, kw := range keywords {
t.Run(kw.Keyword, func(t *testing.T) {
total, res, langs, err := indexer.Search(t.Context(), &internal.SearchOptions{
RepoIDs: kw.RepoIDs,
Keyword: kw.Keyword,
SearchMode: util.IfZero(kw.SearchMode, indexer_module.SearchModeWords),
Paginator: &db.ListOptions{
Page: 1,
PageSize: 10,
},
})
require.NoError(t, err)
require.Len(t, langs, kw.Langs)
hits := make([]codeSearchResult, 0, len(res))
if total > 0 {
assert.NotEmpty(t, kw.Results, "The given scenario does not provide any expected results")
}
for _, hit := range res {
hits = append(hits, codeSearchResult{
Filename: hit.Filename,
Content: hit.Content,
})
}
lastIndex := -1
for _, expected := range kw.Results {
index := slices.Index(hits, expected)
if index == -1 {
assert.Failf(t, "Result not found", "Expected %v in %v", expected, hits)
} else if lastIndex > index {
assert.Failf(t, "Result is out of order", "The order of %v within %v is wrong", expected, hits)
} else {
lastIndex = index
}
}
})
}
assert.NoError(t, tearDownRepositoryIndexes(t.Context(), indexer))
})
}
func TestBleveIndexAndSearch(t *testing.T) {
unittest.PrepareTestEnv(t)
defer test.MockVariableValue(&setting.Indexer.TypeBleveMaxFuzzniess, 2)()
dir := t.TempDir()
idx := bleve.NewIndexer(dir)
defer idx.Close()
_, err := idx.Init(t.Context())
require.NoError(t, err)
testIndexer("bleve", t, idx)
}
func TestESIndexAndSearch(t *testing.T) {
unittest.PrepareTestEnv(t)
u := os.Getenv("TEST_INDEXER_CODE_ES_URL")
if u == "" {
t.SkipNow()
return
}
indexer := elasticsearch.NewIndexer(u, "gitea_codes")
if _, err := indexer.Init(t.Context()); err != nil {
if indexer != nil {
indexer.Close()
}
require.NoError(t, err, "Unable to init ES indexer")
}
defer indexer.Close()
testIndexer("elastic_search", t, indexer)
}
func setupRepositoryIndexes(ctx context.Context, indexer internal.Indexer) error {
for _, repoID := range repositoriesToSearch() {
if err := index(ctx, indexer, repoID); err != nil {
return err
}
}
return nil
}
func tearDownRepositoryIndexes(ctx context.Context, indexer internal.Indexer) error {
for _, repoID := range repositoriesToSearch() {
if err := indexer.Delete(ctx, repoID); err != nil {
return err
}
}
return nil
}
func repositoriesToSearch() []int64 {
return []int64{1, 62}
}
+60
View File
@@ -0,0 +1,60 @@
// Copyright 2023 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package internal
import (
"context"
"errors"
"gitea.dev/models/db"
repo_model "gitea.dev/models/repo"
"gitea.dev/modules/indexer"
"gitea.dev/modules/indexer/internal"
)
// Indexer defines an interface to index and search code contents
type Indexer interface {
internal.Indexer
Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *RepoChanges) error
Delete(ctx context.Context, repoID int64) error
Search(ctx context.Context, opts *SearchOptions) (int64, []*SearchResult, []*SearchResultLanguages, error)
SupportedSearchModes() []indexer.SearchMode
}
type SearchOptions struct {
RepoIDs []int64
Keyword string
Language string
SearchMode indexer.SearchModeType
db.Paginator
}
// NewDummyIndexer returns a dummy indexer
func NewDummyIndexer() Indexer {
return &dummyIndexer{
Indexer: internal.NewDummyIndexer(),
}
}
type dummyIndexer struct {
internal.Indexer
}
func (d *dummyIndexer) SupportedSearchModes() []indexer.SearchMode {
return nil
}
func (d *dummyIndexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *RepoChanges) error {
return errors.New("indexer is not ready")
}
func (d *dummyIndexer) Delete(ctx context.Context, repoID int64) error {
return errors.New("indexer is not ready")
}
func (d *dummyIndexer) Search(ctx context.Context, opts *SearchOptions) (int64, []*SearchResult, []*SearchResultLanguages, error) {
return 0, nil, nil, errors.New("indexer is not ready")
}
+44
View File
@@ -0,0 +1,44 @@
// Copyright 2023 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package internal
import "gitea.dev/modules/timeutil"
type FileUpdate struct {
Filename string
BlobSha string
Size int64
Sized bool
}
// RepoChanges changes (file additions/updates/removals) to a repo
type RepoChanges struct {
Updates []FileUpdate
RemovedFilenames []string
}
// IndexerData represents data stored in the code indexer
type IndexerData struct {
RepoID int64
}
// SearchResult result of performing a search in a repo
type SearchResult struct {
RepoID int64
StartIndex int
EndIndex int
Filename string
Content string
CommitID string
UpdatedUnix timeutil.TimeStamp
Language string
Color string
}
// SearchResultLanguages result of top languages count in search results
type SearchResultLanguages struct {
Language string
Color string
Count int
}
+48
View File
@@ -0,0 +1,48 @@
// Copyright 2023 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package internal
import (
"strings"
"gitea.dev/modules/indexer/internal"
"gitea.dev/modules/log"
)
const filenameMatchNumberOfLines = 7 // Copied from GitHub search
func FilenameIndexerID(repoID int64, filename string) string {
return internal.Base36(repoID) + "_" + filename
}
func ParseIndexerID(indexerID string) (int64, string) {
before, after, ok := strings.Cut(indexerID, "_")
if !ok {
log.Error("Unexpected ID in repo indexer: %s", indexerID)
}
repoID, _ := internal.ParseBase36(before)
return repoID, after
}
func FilenameOfIndexerID(indexerID string) string {
_, after, ok := strings.Cut(indexerID, "_")
if !ok {
log.Error("Unexpected ID in repo indexer: %s", indexerID)
}
return after
}
// FilenameMatchIndexPos returns the boundaries of its first seven lines.
func FilenameMatchIndexPos(content string) (int, int) {
count := 1
for i, c := range content {
if c == '\n' {
count++
if count == filenameMatchNumberOfLines {
return 0, i
}
}
}
return 0, len(content)
}
+153
View File
@@ -0,0 +1,153 @@
// Copyright 2017 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package code
import (
"bytes"
"context"
"html/template"
"strings"
"gitea.dev/modules/highlight"
"gitea.dev/modules/indexer/code/internal"
"gitea.dev/modules/timeutil"
)
// Result a search result to display
type Result struct {
RepoID int64
Filename string
CommitID string
UpdatedUnix timeutil.TimeStamp
Language string
Color string
Lines []*ResultLine
}
type ResultLine struct {
Num int
FormattedContent template.HTML
}
type SearchResultLanguages = internal.SearchResultLanguages
type SearchOptions = internal.SearchOptions
func indices(content string, selectionStartIndex, selectionEndIndex int) (int, int) {
startIndex := selectionStartIndex
numLinesBefore := 0
for ; startIndex > 0; startIndex-- {
if content[startIndex-1] == '\n' {
if numLinesBefore == 1 {
break
}
numLinesBefore++
}
}
endIndex := selectionEndIndex
numLinesAfter := 0
for ; endIndex < len(content); endIndex++ {
if content[endIndex] == '\n' {
if numLinesAfter == 1 {
break
}
numLinesAfter++
}
}
return startIndex, endIndex
}
func writeStrings(buf *bytes.Buffer, strs ...string) error {
for _, s := range strs {
_, err := buf.WriteString(s)
if err != nil {
return err
}
}
return nil
}
func HighlightSearchResultCode(filename, language string, lineNums []int, code string) []*ResultLine {
// we should highlight the whole code block first, otherwise it doesn't work well with multiple line highlighting
lexer := highlight.DetectChromaLexerByFileName(filename, language)
hl := highlight.RenderCodeByLexer(lexer, code)
highlightedLines := highlight.UnsafeSplitHighlightedLines(hl)
// The lineNums outputted by render might not match the original lineNums, because "highlight" removes the last `\n`
lines := make([]*ResultLine, min(len(highlightedLines), len(lineNums)))
for i := range lines {
lines[i] = &ResultLine{
Num: lineNums[i],
FormattedContent: template.HTML(highlightedLines[i]),
}
}
return lines
}
func searchResult(result *internal.SearchResult, startIndex, endIndex int) (*Result, error) {
startLineNum := 1 + strings.Count(result.Content[:startIndex], "\n")
var formattedLinesBuffer bytes.Buffer
contentLines := strings.SplitAfter(result.Content[startIndex:endIndex], "\n")
lineNums := make([]int, 0, len(contentLines))
index := startIndex
for i, line := range contentLines {
var err error
if index < result.EndIndex &&
result.StartIndex < index+len(line) &&
result.StartIndex < result.EndIndex {
openActiveIndex := max(result.StartIndex-index, 0)
closeActiveIndex := min(result.EndIndex-index, len(line))
err = writeStrings(&formattedLinesBuffer,
line[:openActiveIndex],
line[openActiveIndex:closeActiveIndex],
line[closeActiveIndex:],
)
} else {
err = writeStrings(&formattedLinesBuffer, line)
}
if err != nil {
return nil, err
}
lineNums = append(lineNums, startLineNum+i)
index += len(line)
}
return &Result{
RepoID: result.RepoID,
Filename: result.Filename,
CommitID: result.CommitID,
UpdatedUnix: result.UpdatedUnix,
Language: result.Language,
Color: result.Color,
Lines: HighlightSearchResultCode(result.Filename, result.Language, lineNums, formattedLinesBuffer.String()),
}, nil
}
// PerformSearch perform a search on a repository
func PerformSearch(ctx context.Context, opts *SearchOptions) (int64, []*Result, []*SearchResultLanguages, error) {
if opts == nil || len(opts.Keyword) == 0 {
return 0, nil, nil, nil
}
total, results, resultLanguages, err := (*globalIndexer.Load()).Search(ctx, opts)
if err != nil {
return 0, nil, nil, err
}
displayResults := make([]*Result, len(results))
for i, result := range results {
startIndex, endIndex := indices(result.Content, result.StartIndex, result.EndIndex)
displayResults[i], err = searchResult(result, startIndex, endIndex)
if err != nil {
return 0, nil, nil, err
}
}
return total, displayResults, resultLanguages, nil
}
+54
View File
@@ -0,0 +1,54 @@
// Copyright 2025 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package indexer
type SearchModeType string
const (
SearchModeExact SearchModeType = "exact"
SearchModeWords SearchModeType = "words"
SearchModeFuzzy SearchModeType = "fuzzy"
SearchModeRegexp SearchModeType = "regexp"
)
type SearchMode struct {
ModeValue SearchModeType
TooltipTrKey string
TitleTrKey string
}
func SearchModesExactWords() []SearchMode {
return []SearchMode{
{
ModeValue: SearchModeExact,
TooltipTrKey: "search.exact_tooltip",
TitleTrKey: "search.exact",
},
{
ModeValue: SearchModeWords,
TooltipTrKey: "search.words_tooltip",
TitleTrKey: "search.words",
},
}
}
func SearchModesExactWordsFuzzy() []SearchMode {
return append(SearchModesExactWords(), []SearchMode{
{
ModeValue: SearchModeFuzzy,
TooltipTrKey: "search.fuzzy_tooltip",
TitleTrKey: "search.fuzzy",
},
}...)
}
func GitGrepSupportedSearchModes() []SearchMode {
return append(SearchModesExactWords(), []SearchMode{
{
ModeValue: SearchModeRegexp,
TooltipTrKey: "search.regexp_tooltip",
TitleTrKey: "search.regexp",
},
}...)
}
+21
View File
@@ -0,0 +1,21 @@
// Copyright 2023 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package internal
import (
"fmt"
"strconv"
)
func Base36(i int64) string {
return strconv.FormatInt(i, 36)
}
func ParseBase36(s string) (int64, error) {
i, err := strconv.ParseInt(s, 36, 64)
if err != nil {
return 0, fmt.Errorf("invalid base36 integer %q: %w", s, err)
}
return i, nil
}
+58
View File
@@ -0,0 +1,58 @@
// Copyright 2021 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package bleve
import (
"github.com/blevesearch/bleve/v2"
)
// FlushingBatch is a batch of operations that automatically flushes to the
// underlying index once it reaches a certain size.
type FlushingBatch struct {
maxBatchSize int
batch *bleve.Batch
index bleve.Index
}
// NewFlushingBatch creates a new flushing batch for the specified index. Once
// the number of operations in the batch reaches the specified limit, the batch
// automatically flushes its operations to the index.
func NewFlushingBatch(index bleve.Index, maxBatchSize int) *FlushingBatch {
return &FlushingBatch{
maxBatchSize: maxBatchSize,
batch: index.NewBatch(),
index: index,
}
}
// Index add a new index to batch
func (b *FlushingBatch) Index(id string, data any) error {
if err := b.batch.Index(id, data); err != nil {
return err
}
return b.flushIfFull()
}
// Delete add a delete index to batch
func (b *FlushingBatch) Delete(id string) error {
b.batch.Delete(id)
return b.flushIfFull()
}
func (b *FlushingBatch) flushIfFull() error {
if b.batch.Size() < b.maxBatchSize {
return nil
}
return b.Flush()
}
// Flush submit the batch and create a new one
func (b *FlushingBatch) Flush() error {
err := b.index.Batch(b.batch)
if err != nil {
return err
}
b.batch = b.index.NewBatch()
return nil
}
+103
View File
@@ -0,0 +1,103 @@
// Copyright 2023 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package bleve
import (
"context"
"errors"
"gitea.dev/modules/indexer/internal"
"gitea.dev/modules/log"
"github.com/blevesearch/bleve/v2"
"github.com/blevesearch/bleve/v2/mapping"
"github.com/ethantkoenig/rupture"
)
var _ internal.Indexer = &Indexer{}
// Indexer represents a basic bleve indexer implementation
type Indexer struct {
Indexer bleve.Index
indexDir string
version int
mappingGetter MappingGetter
}
type MappingGetter func() (mapping.IndexMapping, error)
func NewIndexer(indexDir string, version int, mappingGetter func() (mapping.IndexMapping, error)) *Indexer {
return &Indexer{
indexDir: indexDir,
version: version,
mappingGetter: mappingGetter,
}
}
// Init initializes the indexer
func (i *Indexer) Init(_ context.Context) (bool, error) {
if i == nil {
return false, errors.New("cannot init nil indexer")
}
if i.Indexer != nil {
return false, errors.New("indexer is already initialized")
}
indexer, version, err := openIndexer(i.indexDir, i.version)
if err != nil {
return false, err
}
if indexer != nil {
i.Indexer = indexer
return true, nil
}
if version != 0 {
log.Warn("Found older bleve index with version %d, Gitea will remove it and rebuild", version)
}
indexMapping, err := i.mappingGetter()
if err != nil {
return false, err
}
indexer, err = bleve.New(i.indexDir, indexMapping)
if err != nil {
return false, err
}
if err = rupture.WriteIndexMetadata(i.indexDir, &rupture.IndexMetadata{
Version: i.version,
}); err != nil {
return false, err
}
i.Indexer = indexer
return false, nil
}
// Ping checks if the indexer is available
func (i *Indexer) Ping(_ context.Context) error {
if i == nil {
return errors.New("cannot ping nil indexer")
}
if i.Indexer == nil {
return errors.New("indexer is not initialized")
}
return nil
}
func (i *Indexer) Close() {
if i == nil || i.Indexer == nil {
return
}
if err := i.Indexer.Close(); err != nil {
log.Error("Failed to close bleve indexer in %q: %v", i.indexDir, err)
}
i.Indexer = nil
}
+66
View File
@@ -0,0 +1,66 @@
// Copyright 2023 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package bleve
import (
"gitea.dev/modules/optional"
"github.com/blevesearch/bleve/v2"
"github.com/blevesearch/bleve/v2/search/query"
)
// NumericEqualityQuery generates a numeric equality query for the given value and field
func NumericEqualityQuery(value int64, field string) *query.NumericRangeQuery {
f := float64(value)
tru := true
q := bleve.NewNumericRangeInclusiveQuery(&f, &f, &tru, &tru)
q.SetField(field)
return q
}
// MatchPhraseQuery generates a match phrase query for the given phrase, field and analyzer
func MatchPhraseQuery(matchPhrase, field, analyzer string, fuzziness int) *query.MatchPhraseQuery {
q := bleve.NewMatchPhraseQuery(matchPhrase)
q.FieldVal = field
q.Analyzer = analyzer
q.Fuzziness = fuzziness
return q
}
// MatchAndQuery generates a match query for the given phrase, field and analyzer
func MatchAndQuery(matchPhrase, field, analyzer string, fuzziness int) *query.MatchQuery {
q := bleve.NewMatchQuery(matchPhrase)
q.FieldVal = field
q.Analyzer = analyzer
q.Fuzziness = fuzziness
q.Operator = query.MatchQueryOperatorAnd
return q
}
// BoolFieldQuery generates a bool field query for the given value and field
func BoolFieldQuery(value bool, field string) *query.BoolFieldQuery {
q := bleve.NewBoolFieldQuery(value)
q.SetField(field)
return q
}
func NumericRangeInclusiveQuery(minOption, maxOption optional.Option[int64], field string) *query.NumericRangeQuery {
var minF, maxF *float64
var minI, maxI *bool
if minOption.Has() {
minF = new(float64)
*minF = float64(minOption.Value())
minI = new(bool)
*minI = true
}
if maxOption.Has() {
maxF = new(float64)
*maxF = float64(maxOption.Value())
maxI = new(bool)
*maxI = true
}
q := bleve.NewNumericRangeInclusiveQuery(minF, maxF, minI, maxI)
q.SetField(field)
return q
}
+90
View File
@@ -0,0 +1,90 @@
// Copyright 2023 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package bleve
import (
"errors"
"os"
"unicode"
"gitea.dev/modules/log"
"gitea.dev/modules/setting"
"gitea.dev/modules/util"
"github.com/blevesearch/bleve/v2"
unicode_tokenizer "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
"github.com/blevesearch/bleve/v2/index/upsidedown"
"github.com/ethantkoenig/rupture"
)
const (
maxFuzziness = 2
)
// openIndexer open the index at the specified path, checking for metadata
// updates and bleve version updates. If index needs to be created (or
// re-created), returns (nil, nil)
func openIndexer(path string, latestVersion int) (bleve.Index, int, error) {
_, err := os.Stat(path)
if err != nil && os.IsNotExist(err) {
return nil, 0, nil
} else if err != nil {
return nil, 0, err
}
metadata, err := rupture.ReadIndexMetadata(path)
if err != nil {
return nil, 0, err
}
if metadata.Version < latestVersion {
// the indexer is using a previous version, so we should delete it and
// re-populate
return nil, metadata.Version, util.RemoveAll(path)
}
index, err := bleve.Open(path)
if err != nil {
if errors.Is(err, upsidedown.IncompatibleVersion) {
log.Warn("Indexer was built with a previous version of bleve, deleting and rebuilding")
return nil, 0, util.RemoveAll(path)
}
return nil, 0, err
}
return index, 0, nil
}
// GuessFuzzinessByKeyword guesses fuzziness based on the levenshtein distance and determines how many chars
// may be different on two string, and they still be considered equivalent.
// Given a phrase, its shortest word determines its fuzziness. If a phrase uses CJK (eg: `갃갃갃` `啊啊啊`), the fuzziness is zero.
func GuessFuzzinessByKeyword(s string) int {
tokenizer := unicode_tokenizer.NewUnicodeTokenizer()
tokens := tokenizer.Tokenize([]byte(s))
if len(tokens) > 0 {
fuzziness := maxFuzziness
for _, token := range tokens {
fuzziness = min(fuzziness, guessFuzzinessByKeyword(string(token.Term)))
}
return fuzziness
}
return 0
}
func guessFuzzinessByKeyword(s string) int {
// according to https://github.com/blevesearch/bleve/issues/1563, the supported max fuzziness is 2
// magic number 4 was chosen to determine the levenshtein distance per each character of a keyword
// BUT, when using CJK (eg: `갃갃갃` `啊啊啊`), it mismatches a lot.
// Likewise, queries whose terms contains characters that are *not* letters should not use fuzziness
for _, r := range s {
if r >= 128 || !unicode.IsLetter(r) {
return 0
}
}
return min(min(setting.Indexer.TypeBleveMaxFuzzniess, maxFuzziness), len(s)/4)
}
@@ -0,0 +1,58 @@
// Copyright 2024 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package bleve
import (
"fmt"
"testing"
"gitea.dev/modules/setting"
"gitea.dev/modules/test"
"github.com/stretchr/testify/assert"
)
func TestBleveGuessFuzzinessByKeyword(t *testing.T) {
defer test.MockVariableValue(&setting.Indexer.TypeBleveMaxFuzzniess, 2)()
scenarios := []struct {
Input string
Fuzziness int // See util.go for the definition of fuzziness in this particular context
}{
{
Input: "",
Fuzziness: 0,
},
{
Input: "Avocado",
Fuzziness: 1,
},
{
Input: "Geschwindigkeit",
Fuzziness: 2,
},
{
Input: "non-exist",
Fuzziness: 0,
},
{
Input: "갃갃갃",
Fuzziness: 0,
},
{
Input: "repo1",
Fuzziness: 0,
},
{
Input: "avocado.md",
Fuzziness: 0,
},
}
for _, scenario := range scenarios {
t.Run(fmt.Sprintf("Fuziniess:%s=%d", scenario.Input, scenario.Fuzziness), func(t *testing.T) {
assert.Equal(t, scenario.Fuzziness, GuessFuzzinessByKeyword(scenario.Input))
})
}
}
+34
View File
@@ -0,0 +1,34 @@
// Copyright 2023 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package db
import (
"context"
"gitea.dev/modules/indexer/internal"
)
var _ internal.Indexer = &Indexer{}
// Indexer represents a basic db indexer implementation
type Indexer struct{}
// Init initializes the indexer
func (i *Indexer) Init(_ context.Context) (bool, error) {
// Return true to indicate that the index was opened/existed.
// So that the indexer will not try to populate the index, the data is already there.
return true, nil
}
// Ping checks if the indexer is available
func (i *Indexer) Ping(_ context.Context) error {
// No need to ping database to check if it is available.
// If the database goes down, Gitea will go down, so nobody will care if the indexer is available.
return nil
}
// Close closes the indexer
func (i *Indexer) Close() {
// nothing to do
}
@@ -0,0 +1,409 @@
// Copyright 2023 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package elasticsearch
import (
"bytes"
"context"
"fmt"
"io"
"net"
"net/http"
"net/url"
"slices"
"strconv"
"strings"
"time"
"gitea.dev/modules/indexer/internal"
"gitea.dev/modules/json"
)
var _ internal.Indexer = &Indexer{}
// Indexer is a narrow wrapper around an Elasticsearch/OpenSearch cluster.
// It targets the REST subset shared by Elasticsearch 7/8/9 and OpenSearch 3.
type Indexer struct {
client *http.Client
base string // base URL with trailing slash, no userinfo
user string
pass string
indexName string
version int
mapping string
}
// NewIndexer builds an Indexer. The connection is opened by Init.
func NewIndexer(rawURL, indexName string, version int, mapping string) *Indexer {
return &Indexer{
base: rawURL,
indexName: indexName,
version: version,
mapping: mapping,
}
}
// Init connects and creates the versioned index if missing, returning true if it already existed.
func (i *Indexer) Init(ctx context.Context) (bool, error) {
parsed, err := url.Parse(i.base)
if err != nil {
return false, fmt.Errorf("parse elasticsearch url: %w", err)
}
if parsed.User != nil {
i.user = parsed.User.Username()
i.pass, _ = parsed.User.Password()
parsed.User = nil
}
base := parsed.String()
if !strings.HasSuffix(base, "/") {
base += "/"
}
i.base = base
// No client-level Timeout: bulk/_delete_by_query can legitimately run for
// minutes on large repos. Per-request deadlines come from the caller's ctx;
// transport-level timeouts cover stalled connects/handshakes/headers so a
// half-open server cannot wedge the indexer indefinitely.
i.client = &http.Client{
Transport: &http.Transport{
Proxy: http.ProxyFromEnvironment,
DialContext: (&net.Dialer{Timeout: 30 * time.Second, KeepAlive: 30 * time.Second}).DialContext,
TLSHandshakeTimeout: 10 * time.Second,
ResponseHeaderTimeout: 30 * time.Second,
ExpectContinueTimeout: 1 * time.Second,
IdleConnTimeout: 90 * time.Second,
MaxIdleConns: 100,
},
}
exists, err := i.indexExists(ctx, i.VersionedIndexName())
if err != nil {
return false, err
}
if exists {
return true, nil
}
if err := i.createIndex(ctx); err != nil {
return false, err
}
return false, nil
}
// Ping returns an error when the cluster is unusable (status != green/yellow).
func (i *Indexer) Ping(ctx context.Context) error {
var body struct {
Status string `json:"status"`
}
if err := i.doJSON(ctx, http.MethodGet, "_cluster/health", nil, &body); err != nil {
return err
}
// Healthy = green; usable = yellow. Red is unusable.
// https://www.elastic.co/guide/en/elasticsearch/reference/current/cluster-health.html
if body.Status != "green" && body.Status != "yellow" {
return fmt.Errorf("status of elasticsearch cluster is %s", body.Status)
}
return nil
}
// Close releases idle HTTP connections held by the client.
func (i *Indexer) Close() {
if i == nil || i.client == nil {
return
}
i.client.CloseIdleConnections()
i.client = nil
}
// Bulk submits index/delete ops. Returns the first item-level failure, if any.
func (i *Indexer) Bulk(ctx context.Context, ops []BulkOp) error {
if len(ops) == 0 {
return nil
}
index := i.VersionedIndexName()
var buf bytes.Buffer
buf.Grow(len(ops) * 256)
for _, op := range ops {
meta := map[string]any{op.action: map[string]any{"_index": index, "_id": op.id}}
if err := writeJSONLine(&buf, meta); err != nil {
return err
}
if op.action == bulkActionIndex {
if err := writeJSONLine(&buf, op.doc); err != nil {
return err
}
}
}
res, err := i.do(ctx, http.MethodPost, urlPath(index, "_bulk"), "application/x-ndjson", bytes.NewReader(buf.Bytes()))
if err != nil {
return err
}
defer drainAndClose(res)
var body struct {
Errors bool `json:"errors"`
Items []map[string]struct {
Status int `json:"status"`
Error json.Value `json:"error"`
} `json:"items"`
}
if err := json.NewDecoder(res.Body).Decode(&body); err != nil {
return err
}
if !body.Errors {
return nil
}
return firstBulkError(body.Items)
}
// firstBulkError returns the first item-level failure in a bulk response.
// Each items entry is a single-key map ({"index": {...}} or {"delete": {...}}).
// Delete-of-missing (404) is idempotent and not reported.
func firstBulkError(items []map[string]struct {
Status int `json:"status"`
Error json.Value `json:"error"`
},
) error {
for _, item := range items {
for action, result := range item {
if action == bulkActionDelete && result.Status == http.StatusNotFound {
continue
}
if result.Status >= 300 {
return fmt.Errorf("bulk %s failed (status %d): %s", action, result.Status, string(result.Error))
}
}
}
return nil
}
// Index writes a single document.
func (i *Indexer) Index(ctx context.Context, id string, doc any) error {
body, err := json.Marshal(doc)
if err != nil {
return err
}
return i.doJSON(ctx, http.MethodPut, urlPath(i.VersionedIndexName(), "_doc", id), bytes.NewReader(body), nil)
}
// Delete removes a single document by id. Missing ids are not an error.
func (i *Indexer) Delete(ctx context.Context, id string) error {
res, err := i.do(ctx, http.MethodDelete, urlPath(i.VersionedIndexName(), "_doc", id), "", nil, http.StatusNotFound)
if err != nil {
return err
}
drainAndClose(res)
return nil
}
// DeleteByQuery removes every document matching the query.
func (i *Indexer) DeleteByQuery(ctx context.Context, query Query) error {
body, err := json.Marshal(map[string]any{"query": query.querySource()})
if err != nil {
return err
}
return i.doJSON(ctx, http.MethodPost, urlPath(i.VersionedIndexName(), "_delete_by_query"), bytes.NewReader(body), nil)
}
// Refresh forces a refresh so recent writes are searchable.
func (i *Indexer) Refresh(ctx context.Context) error {
return i.doJSON(ctx, http.MethodPost, urlPath(i.VersionedIndexName(), "_refresh"), nil, nil)
}
// Search runs a search request and decodes the reply.
func (i *Indexer) Search(ctx context.Context, req SearchRequest) (*SearchResponse, error) {
body := map[string]any{}
if req.Query != nil {
body["query"] = req.Query.querySource()
}
if len(req.Sort) > 0 {
sorts := make([]map[string]any, len(req.Sort))
for idx, s := range req.Sort {
sorts[idx] = s.source()
}
body["sort"] = sorts
}
if req.From > 0 {
body["from"] = req.From
}
body["size"] = req.Size
if len(req.Aggregations) > 0 {
body["aggs"] = req.Aggregations
}
if len(req.Highlight) > 0 {
body["highlight"] = req.Highlight
}
payload, err := json.Marshal(body)
if err != nil {
return nil, err
}
// Default track_total_hits is 10000 (capped count); send it explicitly so
// callers can choose between exact totals (true) and skipping counting (false).
path := urlPath(i.VersionedIndexName(), "_search") + "?track_total_hits=" + strconv.FormatBool(req.TrackTotal)
res, err := i.do(ctx, http.MethodPost, path, "application/json", bytes.NewReader(payload))
if err != nil {
return nil, err
}
defer drainAndClose(res)
return decodeSearchResponse(res.Body)
}
func (i *Indexer) indexExists(ctx context.Context, name string) (bool, error) {
res, err := i.do(ctx, http.MethodHead, urlPath(name), "", nil, http.StatusNotFound)
if err != nil {
return false, err
}
drainAndClose(res)
return res.StatusCode == http.StatusOK, nil
}
func (i *Indexer) createIndex(ctx context.Context) error {
var body struct {
Acknowledged bool `json:"acknowledged"`
}
if err := i.doJSON(ctx, http.MethodPut, urlPath(i.VersionedIndexName()), bytes.NewBufferString(i.mapping), &body); err != nil {
return fmt.Errorf("create index %s: %w", i.VersionedIndexName(), err)
}
if !body.Acknowledged {
return fmt.Errorf("create index %s not acknowledged", i.VersionedIndexName())
}
i.checkOldIndexes(ctx)
return nil
}
// do sends a request and returns the response. Status >= 300 is turned into
// an error unless the status appears in okStatus. The caller closes Body.
func (i *Indexer) do(ctx context.Context, method, path, contentType string, body io.Reader, okStatus ...int) (*http.Response, error) {
req, err := http.NewRequestWithContext(ctx, method, i.base+path, body)
if err != nil {
return nil, err
}
if contentType != "" {
req.Header.Set("Content-Type", contentType)
}
if i.user != "" || i.pass != "" {
req.SetBasicAuth(i.user, i.pass)
}
res, err := i.client.Do(req)
if err != nil {
return nil, err
}
if res.StatusCode >= 300 && !slices.Contains(okStatus, res.StatusCode) {
msg := readErrBody(res)
res.Body.Close()
return nil, fmt.Errorf("%s %s: %s", method, path, msg)
}
return res, nil
}
// doJSON sends a request with a JSON body and, when out is non-nil, decodes
// the JSON response into it.
func (i *Indexer) doJSON(ctx context.Context, method, path string, body io.Reader, out any) error {
contentType := ""
if body != nil {
contentType = "application/json"
}
res, err := i.do(ctx, method, path, contentType, body)
if err != nil {
return err
}
defer drainAndClose(res)
if out == nil {
return nil
}
return json.NewDecoder(res.Body).Decode(out)
}
// drainAndClose discards any unread response body before closing so the
// underlying TCP connection can be reused for keep-alive.
func drainAndClose(res *http.Response) {
_, _ = io.Copy(io.Discard, res.Body)
res.Body.Close()
}
func writeJSONLine(buf *bytes.Buffer, v any) error {
enc, err := json.Marshal(v)
if err != nil {
return err
}
buf.Write(enc)
buf.WriteByte('\n')
return nil
}
// readErrBody reads up to 4 KiB of an error response and drains the rest so
// the underlying connection can be reused (keep-alive needs Body fully read).
func readErrBody(res *http.Response) string {
const limit = 4 << 10
b, _ := io.ReadAll(io.LimitReader(res.Body, limit))
_, _ = io.Copy(io.Discard, res.Body)
return fmt.Sprintf("status %d: %s", res.StatusCode, bytes.TrimSpace(b))
}
func decodeSearchResponse(r io.Reader) (*SearchResponse, error) {
var raw struct {
Hits struct {
Total struct {
Value int64 `json:"value"`
} `json:"total"`
Hits []struct {
ID string `json:"_id"`
Score float64 `json:"_score"`
Source json.Value `json:"_source"`
Highlight map[string][]string `json:"highlight"`
} `json:"hits"`
} `json:"hits"`
Aggregations map[string]struct {
Buckets []struct {
Key any `json:"key"`
DocCount int64 `json:"doc_count"`
} `json:"buckets"`
} `json:"aggregations"`
}
if err := json.NewDecoder(r).Decode(&raw); err != nil {
return nil, err
}
resp := &SearchResponse{
Total: raw.Hits.Total.Value,
Hits: make([]SearchHit, 0, len(raw.Hits.Hits)),
}
for _, h := range raw.Hits.Hits {
resp.Hits = append(resp.Hits, SearchHit{
ID: h.ID,
Score: h.Score,
Source: h.Source,
Highlight: h.Highlight,
})
}
if len(raw.Aggregations) > 0 {
resp.Aggregations = make(map[string][]AggBucket, len(raw.Aggregations))
for name, agg := range raw.Aggregations {
buckets := make([]AggBucket, len(agg.Buckets))
for idx, b := range agg.Buckets {
buckets[idx] = AggBucket{Key: b.Key, DocCount: b.DocCount}
}
resp.Aggregations[name] = buckets
}
}
return resp, nil
}
// urlPath joins path segments with `/` and percent-escapes each.
func urlPath(segments ...string) string {
var b bytes.Buffer
for idx, s := range segments {
if idx > 0 {
b.WriteByte('/')
}
b.WriteString(url.PathEscape(s))
}
return b.String()
}
@@ -0,0 +1,39 @@
// Copyright 2026 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package elasticsearch
import (
"strings"
"testing"
"gitea.dev/modules/test"
"github.com/stretchr/testify/require"
)
func newRealIndexer(t *testing.T) *Indexer {
t.Helper()
esURL := test.ExternalServiceHTTP(t, "TEST_ELASTICSEARCH_URL", "http://elasticsearch:9200")
indexName := "gitea_test_" + strings.ReplaceAll(strings.ToLower(t.Name()), "/", "_")
ix := NewIndexer(esURL, indexName, 1, `{"mappings":{"properties":{"x":{"type":"keyword"}}}}`)
_, err := ix.Init(t.Context())
require.NoError(t, err)
t.Cleanup(ix.Close)
return ix
}
func TestPing(t *testing.T) {
ix := newRealIndexer(t)
require.NoError(t, ix.Ping(t.Context()))
}
func TestDeleteSwallows404(t *testing.T) {
ix := newRealIndexer(t)
require.NoError(t, ix.Delete(t.Context(), "missing-id"))
}
func TestBulkAcceptsDelete404(t *testing.T) {
ix := newRealIndexer(t)
require.NoError(t, ix.Bulk(t.Context(), []BulkOp{DeleteOp("missing-id")}))
}
@@ -0,0 +1,132 @@
// Copyright 2026 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package elasticsearch
// MultiMatch types used by the call sites. See
// https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-multi-match-query.html#multi-match-types
const (
MultiMatchTypeBestFields = "best_fields"
MultiMatchTypePhrasePrefix = "phrase_prefix"
)
// ToAnySlice converts []T to []any for variadic query args like TermsQuery.
func ToAnySlice[T any](s []T) []any {
out := make([]any, len(s))
for idx, v := range s {
out[idx] = v
}
return out
}
// Query is an Elasticsearch query DSL node. It marshals to the JSON
// object expected by the ES query API.
type Query interface {
querySource() map[string]any
}
type rawQuery map[string]any
func (q rawQuery) querySource() map[string]any { return q }
// TermQuery matches documents whose `field` exactly equals `value`.
func TermQuery(field string, value any) Query {
return rawQuery{"term": map[string]any{field: value}}
}
// TermsQuery matches documents whose `field` equals any of `values`.
func TermsQuery(field string, values ...any) Query {
return rawQuery{"terms": map[string]any{field: values}}
}
// MatchQuery is a full-text match on a single field.
func MatchQuery(field string, value any) Query {
return rawQuery{"match": map[string]any{field: value}}
}
// MatchPhraseQuery matches the exact phrase on `field`.
func MatchPhraseQuery(field, value string) Query {
return rawQuery{"match_phrase": map[string]any{field: value}}
}
// MultiMatchQuery is the fluent builder for a multi_match query.
type MultiMatchQuery struct {
query any
fields []string
typ string
operator string
}
// NewMultiMatchQuery creates a multi_match query over the given fields.
func NewMultiMatchQuery(query any, fields ...string) *MultiMatchQuery {
return &MultiMatchQuery{query: query, fields: fields}
}
func (m *MultiMatchQuery) Type(t string) *MultiMatchQuery { m.typ = t; return m }
func (m *MultiMatchQuery) Operator(op string) *MultiMatchQuery { m.operator = op; return m }
func (m *MultiMatchQuery) querySource() map[string]any {
body := map[string]any{"query": m.query}
if len(m.fields) > 0 {
body["fields"] = m.fields
}
if m.typ != "" {
body["type"] = m.typ
}
if m.operator != "" {
body["operator"] = m.operator
}
return map[string]any{"multi_match": body}
}
// RangeQuery is the fluent builder for a range query.
type RangeQuery struct {
field string
body map[string]any
}
func NewRangeQuery(field string) *RangeQuery {
return &RangeQuery{field: field, body: map[string]any{}}
}
func (r *RangeQuery) Gte(v any) *RangeQuery { r.body["gte"] = v; return r }
func (r *RangeQuery) Lte(v any) *RangeQuery { r.body["lte"] = v; return r }
func (r *RangeQuery) querySource() map[string]any {
return map[string]any{"range": map[string]any{r.field: r.body}}
}
// BoolQuery is the fluent builder for a bool query.
type BoolQuery struct {
must []Query
should []Query
mustNot []Query
}
func NewBoolQuery() *BoolQuery { return &BoolQuery{} }
func (b *BoolQuery) Must(q ...Query) *BoolQuery { b.must = append(b.must, q...); return b }
func (b *BoolQuery) Should(q ...Query) *BoolQuery { b.should = append(b.should, q...); return b }
func (b *BoolQuery) MustNot(q ...Query) *BoolQuery { b.mustNot = append(b.mustNot, q...); return b }
func (b *BoolQuery) querySource() map[string]any {
body := map[string]any{}
if len(b.must) > 0 {
body["must"] = querySlice(b.must)
}
if len(b.should) > 0 {
body["should"] = querySlice(b.should)
}
if len(b.mustNot) > 0 {
body["must_not"] = querySlice(b.mustNot)
}
return map[string]any{"bool": body}
}
func querySlice(queries []Query) []map[string]any {
out := make([]map[string]any, len(queries))
for idx, q := range queries {
out[idx] = q.querySource()
}
return out
}
@@ -0,0 +1,76 @@
// Copyright 2026 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package elasticsearch
import "gitea.dev/modules/json"
const (
bulkActionIndex = "index"
bulkActionDelete = "delete"
)
// BulkOp is a single write inside a Bulk call. Construct with IndexOp or DeleteOp.
type BulkOp struct {
action string
id string
doc any
}
// IndexOp builds a bulk index operation.
func IndexOp(id string, doc any) BulkOp {
return BulkOp{action: bulkActionIndex, id: id, doc: doc}
}
// DeleteOp builds a bulk delete operation.
func DeleteOp(id string) BulkOp {
return BulkOp{action: bulkActionDelete, id: id}
}
// SortField is one entry of the search sort array.
type SortField struct {
Field string
Desc bool
}
func (s SortField) source() map[string]any {
order := "asc"
if s.Desc {
order = "desc"
}
return map[string]any{s.Field: map[string]any{"order": order}}
}
// SearchRequest captures everything Gitea sends to the _search endpoint.
// Aggregations and Highlight are raw ES JSON bodies — callers write them as
// map[string]any since each has exactly one call site with a fixed shape.
type SearchRequest struct {
Query Query
Sort []SortField
From int
Size int
TrackTotal bool
Aggregations map[string]any
Highlight map[string]any
}
// SearchHit is a single result row.
type SearchHit struct {
ID string
Score float64
Source json.Value
Highlight map[string][]string
}
// AggBucket is a terms-aggregation bucket.
type AggBucket struct {
Key any
DocCount int64
}
// SearchResponse is Gitea's decoded view of the search reply.
type SearchResponse struct {
Total int64
Hits []SearchHit
Aggregations map[string][]AggBucket
}
@@ -0,0 +1,34 @@
// Copyright 2023 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package elasticsearch
import (
"context"
"fmt"
"gitea.dev/modules/log"
)
// VersionedIndexName returns the full index name with version suffix.
func (i *Indexer) VersionedIndexName() string {
return versionedIndexName(i.indexName, i.version)
}
func versionedIndexName(indexName string, version int) string {
if version == 0 {
// Old index name without version
return indexName
}
return fmt.Sprintf("%s.v%d", indexName, version)
}
func (i *Indexer) checkOldIndexes(ctx context.Context) {
for v := range i.version {
indexName := versionedIndexName(i.indexName, v)
exists, err := i.indexExists(ctx, indexName)
if err == nil && exists {
log.Warn("Found older elasticsearch index named %q, Gitea will keep the old NOT DELETED. You can delete the old version after the upgrade succeed.", indexName)
}
}
}
+37
View File
@@ -0,0 +1,37 @@
// Copyright 2023 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package internal
import (
"context"
"errors"
)
// Indexer defines an basic indexer interface
type Indexer interface {
// Init initializes the indexer
// returns true if the index was opened/existed (with data populated), false if it was created/not-existed (with no data)
Init(ctx context.Context) (bool, error)
// Ping checks if the indexer is available
Ping(ctx context.Context) error
// Close closes the indexer
Close()
}
// NewDummyIndexer returns a dummy indexer
func NewDummyIndexer() Indexer {
return &dummyIndexer{}
}
type dummyIndexer struct{}
func (d *dummyIndexer) Init(ctx context.Context) (bool, error) {
return false, errors.New("indexer is not ready")
}
func (d *dummyIndexer) Ping(ctx context.Context) error {
return errors.New("indexer is not ready")
}
func (d *dummyIndexer) Close() {}
@@ -0,0 +1,119 @@
// Copyright 2023 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package meilisearch
import (
"fmt"
"strings"
)
// Filter represents a filter for meilisearch queries.
// It's just a simple wrapper around a string.
// DO NOT assume that it is a complete implementation.
type Filter interface {
Statement() string
}
type FilterAnd struct {
filters []Filter
}
func (f *FilterAnd) Statement() string {
var statements []string
for _, filter := range f.filters {
if s := filter.Statement(); s != "" {
statements = append(statements, fmt.Sprintf("(%s)", s))
}
}
return strings.Join(statements, " AND ")
}
func (f *FilterAnd) And(filter Filter) *FilterAnd {
f.filters = append(f.filters, filter)
return f
}
type FilterOr struct {
filters []Filter
}
func (f *FilterOr) Statement() string {
var statements []string
for _, filter := range f.filters {
if s := filter.Statement(); s != "" {
statements = append(statements, fmt.Sprintf("(%s)", s))
}
}
return strings.Join(statements, " OR ")
}
func (f *FilterOr) Or(filter Filter) *FilterOr {
f.filters = append(f.filters, filter)
return f
}
type FilterIn string
// NewFilterIn creates a new FilterIn.
// It supports int64 only, to avoid extra works to handle strings with special characters.
func NewFilterIn[T int64](field string, values ...T) FilterIn {
if len(values) == 0 {
return ""
}
vs := make([]string, len(values))
for i, v := range values {
vs[i] = fmt.Sprintf("%v", v)
}
return FilterIn(fmt.Sprintf("%s IN [%v]", field, strings.Join(vs, ", ")))
}
func (f FilterIn) Statement() string {
return string(f)
}
type FilterEq string
// NewFilterEq creates a new FilterEq.
// It supports int64 and bool only, to avoid extra works to handle strings with special characters.
func NewFilterEq[T bool | int64](field string, value T) FilterEq {
return FilterEq(fmt.Sprintf("%s = %v", field, value))
}
func (f FilterEq) Statement() string {
return string(f)
}
type FilterNot string
func NewFilterNot(filter Filter) FilterNot {
return FilterNot(fmt.Sprintf("NOT (%s)", filter.Statement()))
}
func (f FilterNot) Statement() string {
return string(f)
}
type FilterGte string
// NewFilterGte creates a new FilterGte.
// It supports int64 only, to avoid extra works to handle strings with special characters.
func NewFilterGte[T int64](field string, value T) FilterGte {
return FilterGte(fmt.Sprintf("%s >= %v", field, value))
}
func (f FilterGte) Statement() string {
return string(f)
}
type FilterLte string
// NewFilterLte creates a new FilterLte.
// It supports int64 only, to avoid extra works to handle strings with special characters.
func NewFilterLte[T int64](field string, value T) FilterLte {
return FilterLte(fmt.Sprintf("%s <= %v", field, value))
}
func (f FilterLte) Statement() string {
return string(f)
}
@@ -0,0 +1,88 @@
// Copyright 2023 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package meilisearch
import (
"context"
"errors"
"fmt"
"github.com/meilisearch/meilisearch-go"
)
// Indexer represents a basic meilisearch indexer implementation
type Indexer struct {
Client meilisearch.ServiceManager
url, apiKey string
indexName string
version int
settings *meilisearch.Settings
}
func NewIndexer(url, apiKey, indexName string, version int, settings *meilisearch.Settings) *Indexer {
return &Indexer{
url: url,
apiKey: apiKey,
indexName: indexName,
version: version,
settings: settings,
}
}
// Init initializes the indexer
func (i *Indexer) Init(_ context.Context) (bool, error) {
if i == nil {
return false, errors.New("cannot init nil indexer")
}
if i.Client != nil {
return false, errors.New("indexer is already initialized")
}
i.Client = meilisearch.New(i.url, meilisearch.WithAPIKey(i.apiKey))
_, err := i.Client.GetIndex(i.VersionedIndexName())
if err == nil {
return true, nil
}
_, err = i.Client.CreateIndex(&meilisearch.IndexConfig{
Uid: i.VersionedIndexName(),
PrimaryKey: "id",
})
if err != nil {
return false, err
}
i.checkOldIndexes()
_, err = i.Client.Index(i.VersionedIndexName()).UpdateSettings(i.settings)
return false, err
}
// Ping checks if the indexer is available
func (i *Indexer) Ping(ctx context.Context) error {
if i == nil {
return errors.New("cannot ping nil indexer")
}
if i.Client == nil {
return errors.New("indexer is not initialized")
}
resp, err := i.Client.Health()
if err != nil {
return err
}
if resp.Status != "available" {
// See https://docs.meilisearch.com/reference/api/health.html#status
return fmt.Errorf("status of meilisearch is not available: %s", resp.Status)
}
return nil
}
// Close closes the indexer
func (i *Indexer) Close() {
if i == nil {
return
}
i.Client = nil
}
@@ -0,0 +1,38 @@
// Copyright 2023 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package meilisearch
import (
"fmt"
"gitea.dev/modules/log"
)
// VersionedIndexName returns the full index name with version
func (i *Indexer) VersionedIndexName() string {
return versionedIndexName(i.indexName, i.version)
}
func versionedIndexName(indexName string, version int) string {
if version == 0 {
// Old index name without version
return indexName
}
// The format of the index name is <index_name>_v<version>, not <index_name>.v<version> like elasticsearch.
// Because meilisearch does not support "." in index name, it should contain only alphanumeric characters, hyphens (-) and underscores (_).
// See https://www.meilisearch.com/docs/learn/core_concepts/indexes#index-uid
return fmt.Sprintf("%s_v%d", indexName, version)
}
func (i *Indexer) checkOldIndexes() {
for v := 0; v < i.version; v++ {
indexName := versionedIndexName(i.indexName, v)
_, err := i.Client.GetIndex(indexName)
if err == nil {
log.Warn("Found older meilisearch index named %q, Gitea will keep the old NOT DELETED. You can delete the old version after the upgrade succeed.", indexName)
}
}
}
+34
View File
@@ -0,0 +1,34 @@
// Copyright 2023 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package internal
import (
"math"
"gitea.dev/models/db"
)
// ParsePaginator parses a db.Paginator into a skip and limit
func ParsePaginator(paginator *db.ListOptions, maxNums ...int) (int, int) {
// Use a very large number to indicate no limit
unlimited := math.MaxInt32
if len(maxNums) > 0 {
// Some indexer engines have a limit on the page size, respect that
unlimited = maxNums[0]
}
if paginator == nil || paginator.IsListAll() {
// It shouldn't happen. In actual usage scenarios, there should not be requests to search all.
// But if it does happen, respect it and return "unlimited".
// And it's also useful for testing.
return 0, unlimited
}
if paginator.PageSize == 0 {
// Do not return any results when searching, it's used to get the total count only.
return 0, 0
}
return paginator.GetSkipTake()
}
+326
View File
@@ -0,0 +1,326 @@
// Copyright 2018 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package bleve
import (
"context"
"strconv"
"gitea.dev/modules/indexer"
indexer_internal "gitea.dev/modules/indexer/internal"
inner_bleve "gitea.dev/modules/indexer/internal/bleve"
"gitea.dev/modules/indexer/issues/internal"
"gitea.dev/modules/optional"
"gitea.dev/modules/util"
"github.com/blevesearch/bleve/v2"
"github.com/blevesearch/bleve/v2/analysis/analyzer/custom"
"github.com/blevesearch/bleve/v2/analysis/token/camelcase"
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
"github.com/blevesearch/bleve/v2/analysis/token/unicodenorm"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
"github.com/blevesearch/bleve/v2/mapping"
"github.com/blevesearch/bleve/v2/search/query"
)
const (
issueIndexerAnalyzer = "issueIndexer"
issueIndexerDocType = "issueIndexerDocType"
issueIndexerLatestVersion = 6
)
const unicodeNormalizeName = "unicodeNormalize"
func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
return m.AddCustomTokenFilter(unicodeNormalizeName, map[string]any{
"type": unicodenorm.Name,
"form": unicodenorm.NFC,
})
}
const maxBatchSize = 16
// IndexerData an update to the issue indexer
type IndexerData internal.IndexerData
// Type returns the document type, for bleve's mapping.Classifier interface.
func (i *IndexerData) Type() string {
return issueIndexerDocType
}
// generateIssueIndexMapping generates the bleve index mapping for issues
func generateIssueIndexMapping() (mapping.IndexMapping, error) {
mapping := bleve.NewIndexMapping()
docMapping := bleve.NewDocumentMapping()
numericFieldMapping := bleve.NewNumericFieldMapping()
numericFieldMapping.Store = false
numericFieldMapping.IncludeInAll = false
docMapping.AddFieldMappingsAt("repo_id", numericFieldMapping)
textFieldMapping := bleve.NewTextFieldMapping()
textFieldMapping.Store = false
textFieldMapping.IncludeInAll = false
boolFieldMapping := bleve.NewBooleanFieldMapping()
boolFieldMapping.Store = false
boolFieldMapping.IncludeInAll = false
numberFieldMapping := bleve.NewNumericFieldMapping()
numberFieldMapping.Store = false
numberFieldMapping.IncludeInAll = false
docMapping.AddFieldMappingsAt("is_public", boolFieldMapping)
docMapping.AddFieldMappingsAt("title", textFieldMapping)
docMapping.AddFieldMappingsAt("content", textFieldMapping)
docMapping.AddFieldMappingsAt("comments", textFieldMapping)
docMapping.AddFieldMappingsAt("is_pull", boolFieldMapping)
docMapping.AddFieldMappingsAt("is_closed", boolFieldMapping)
docMapping.AddFieldMappingsAt("is_archived", boolFieldMapping)
docMapping.AddFieldMappingsAt("label_ids", numberFieldMapping)
docMapping.AddFieldMappingsAt("no_label", boolFieldMapping)
docMapping.AddFieldMappingsAt("milestone_id", numberFieldMapping)
docMapping.AddFieldMappingsAt("project_ids", numberFieldMapping)
docMapping.AddFieldMappingsAt("no_project", boolFieldMapping)
docMapping.AddFieldMappingsAt("poster_id", numberFieldMapping)
docMapping.AddFieldMappingsAt("assignee_id", numberFieldMapping)
docMapping.AddFieldMappingsAt("mention_ids", numberFieldMapping)
docMapping.AddFieldMappingsAt("reviewed_ids", numberFieldMapping)
docMapping.AddFieldMappingsAt("review_requested_ids", numberFieldMapping)
docMapping.AddFieldMappingsAt("subscriber_ids", numberFieldMapping)
docMapping.AddFieldMappingsAt("updated_unix", numberFieldMapping)
docMapping.AddFieldMappingsAt("created_unix", numberFieldMapping)
docMapping.AddFieldMappingsAt("deadline_unix", numberFieldMapping)
docMapping.AddFieldMappingsAt("comment_count", numberFieldMapping)
if err := addUnicodeNormalizeTokenFilter(mapping); err != nil {
return nil, err
} else if err = mapping.AddCustomAnalyzer(issueIndexerAnalyzer, map[string]any{
"type": custom.Name,
"char_filters": []string{},
"tokenizer": unicode.Name,
"token_filters": []string{unicodeNormalizeName, camelcase.Name, lowercase.Name},
}); err != nil {
return nil, err
}
mapping.DefaultAnalyzer = issueIndexerAnalyzer
mapping.AddDocumentMapping(issueIndexerDocType, docMapping)
mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping())
mapping.DefaultMapping = bleve.NewDocumentDisabledMapping() // disable default mapping, avoid indexing unexpected structs
return mapping, nil
}
var _ internal.Indexer = &Indexer{}
// Indexer implements Indexer interface
type Indexer struct {
inner *inner_bleve.Indexer
indexer_internal.Indexer // do not composite inner_bleve.Indexer directly to avoid exposing too much
}
func (b *Indexer) SupportedSearchModes() []indexer.SearchMode {
return indexer.SearchModesExactWordsFuzzy()
}
// NewIndexer creates a new bleve local indexer
func NewIndexer(indexDir string) *Indexer {
inner := inner_bleve.NewIndexer(indexDir, issueIndexerLatestVersion, generateIssueIndexMapping)
return &Indexer{
Indexer: inner,
inner: inner,
}
}
// Index will save the index data
func (b *Indexer) Index(_ context.Context, issues ...*internal.IndexerData) error {
batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize)
for _, issue := range issues {
if err := batch.Index(indexer_internal.Base36(issue.ID), (*IndexerData)(issue)); err != nil {
return err
}
}
return batch.Flush()
}
// Delete deletes indexes by ids
func (b *Indexer) Delete(_ context.Context, ids ...int64) error {
batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize)
for _, id := range ids {
if err := batch.Delete(indexer_internal.Base36(id)); err != nil {
return err
}
}
return batch.Flush()
}
// Search searches for issues by given conditions.
// Returns the matching issue IDs
func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (*internal.SearchResult, error) {
var queries []query.Query
if options.Keyword != "" {
searchMode := util.IfZero(options.SearchMode, b.SupportedSearchModes()[0].ModeValue)
if searchMode == indexer.SearchModeWords || searchMode == indexer.SearchModeFuzzy {
fuzziness := 0
if searchMode == indexer.SearchModeFuzzy {
fuzziness = inner_bleve.GuessFuzzinessByKeyword(options.Keyword)
}
queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{
inner_bleve.MatchAndQuery(options.Keyword, "title", issueIndexerAnalyzer, fuzziness),
inner_bleve.MatchAndQuery(options.Keyword, "content", issueIndexerAnalyzer, fuzziness),
inner_bleve.MatchAndQuery(options.Keyword, "comments", issueIndexerAnalyzer, fuzziness),
}...))
} else /* exact */ {
queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{
inner_bleve.MatchPhraseQuery(options.Keyword, "title", issueIndexerAnalyzer, 0),
inner_bleve.MatchPhraseQuery(options.Keyword, "content", issueIndexerAnalyzer, 0),
inner_bleve.MatchPhraseQuery(options.Keyword, "comments", issueIndexerAnalyzer, 0),
}...))
}
}
if len(options.RepoIDs) > 0 || options.AllPublic {
var repoQueries []query.Query
for _, repoID := range options.RepoIDs {
repoQueries = append(repoQueries, inner_bleve.NumericEqualityQuery(repoID, "repo_id"))
}
if options.AllPublic {
repoQueries = append(repoQueries, inner_bleve.BoolFieldQuery(true, "is_public"))
}
queries = append(queries, bleve.NewDisjunctionQuery(repoQueries...))
}
if options.IsPull.Has() {
queries = append(queries, inner_bleve.BoolFieldQuery(options.IsPull.Value(), "is_pull"))
}
if options.IsClosed.Has() {
queries = append(queries, inner_bleve.BoolFieldQuery(options.IsClosed.Value(), "is_closed"))
}
if options.IsArchived.Has() {
queries = append(queries, inner_bleve.BoolFieldQuery(options.IsArchived.Value(), "is_archived"))
}
if options.NoLabelOnly {
queries = append(queries, inner_bleve.BoolFieldQuery(true, "no_label"))
} else {
if len(options.IncludedLabelIDs) > 0 {
var includeQueries []query.Query
for _, labelID := range options.IncludedLabelIDs {
includeQueries = append(includeQueries, inner_bleve.NumericEqualityQuery(labelID, "label_ids"))
}
queries = append(queries, bleve.NewConjunctionQuery(includeQueries...))
} else if len(options.IncludedAnyLabelIDs) > 0 {
var includeQueries []query.Query
for _, labelID := range options.IncludedAnyLabelIDs {
includeQueries = append(includeQueries, inner_bleve.NumericEqualityQuery(labelID, "label_ids"))
}
queries = append(queries, bleve.NewDisjunctionQuery(includeQueries...))
}
if len(options.ExcludedLabelIDs) > 0 {
var excludeQueries []query.Query
for _, labelID := range options.ExcludedLabelIDs {
q := bleve.NewBooleanQuery()
q.AddMustNot(inner_bleve.NumericEqualityQuery(labelID, "label_ids"))
excludeQueries = append(excludeQueries, q)
}
queries = append(queries, bleve.NewConjunctionQuery(excludeQueries...))
}
}
if len(options.MilestoneIDs) > 0 {
var milestoneQueries []query.Query
for _, milestoneID := range options.MilestoneIDs {
milestoneQueries = append(milestoneQueries, inner_bleve.NumericEqualityQuery(milestoneID, "milestone_id"))
}
queries = append(queries, bleve.NewDisjunctionQuery(milestoneQueries...))
}
if options.NoProjectOnly {
queries = append(queries, inner_bleve.BoolFieldQuery(true, "no_project"))
} else if len(options.ProjectIDs) > 0 {
var projectQueries []query.Query
for _, projectID := range options.ProjectIDs {
projectQueries = append(projectQueries, inner_bleve.NumericEqualityQuery(projectID, "project_ids"))
}
// FIXME: ISSUE-MULTIPLE-PROJECTS-FILTER: this logic is not right, it should use "AND" but not "OR"
queries = append(queries, bleve.NewDisjunctionQuery(projectQueries...))
}
if options.PosterID != "" {
// "(none)" becomes 0, it means no poster
posterIDInt64, _ := strconv.ParseInt(options.PosterID, 10, 64)
queries = append(queries, inner_bleve.NumericEqualityQuery(posterIDInt64, "poster_id"))
}
if options.AssigneeID != "" {
if options.AssigneeID == "(any)" {
queries = append(queries, inner_bleve.NumericRangeInclusiveQuery(optional.Some[int64](1), optional.None[int64](), "assignee_id"))
} else {
// "(none)" becomes 0, it means no assignee
assigneeIDInt64, _ := strconv.ParseInt(options.AssigneeID, 10, 64)
queries = append(queries, inner_bleve.NumericEqualityQuery(assigneeIDInt64, "assignee_id"))
}
}
if options.MentionID.Has() {
queries = append(queries, inner_bleve.NumericEqualityQuery(options.MentionID.Value(), "mention_ids"))
}
if options.ReviewedID.Has() {
queries = append(queries, inner_bleve.NumericEqualityQuery(options.ReviewedID.Value(), "reviewed_ids"))
}
if options.ReviewRequestedID.Has() {
queries = append(queries, inner_bleve.NumericEqualityQuery(options.ReviewRequestedID.Value(), "review_requested_ids"))
}
if options.SubscriberID.Has() {
queries = append(queries, inner_bleve.NumericEqualityQuery(options.SubscriberID.Value(), "subscriber_ids"))
}
if options.UpdatedAfterUnix.Has() || options.UpdatedBeforeUnix.Has() {
queries = append(queries, inner_bleve.NumericRangeInclusiveQuery(
options.UpdatedAfterUnix,
options.UpdatedBeforeUnix,
"updated_unix"))
}
var indexerQuery query.Query = bleve.NewConjunctionQuery(queries...)
if len(queries) == 0 {
indexerQuery = bleve.NewMatchAllQuery()
}
skip, limit := indexer_internal.ParsePaginator(options.Paginator)
search := bleve.NewSearchRequestOptions(indexerQuery, limit, skip, false)
if options.SortBy == "" {
options.SortBy = internal.SortByCreatedAsc
}
search.SortBy([]string{string(options.SortBy), "-_id"})
result, err := b.inner.Indexer.SearchInContext(ctx, search)
if err != nil {
return nil, err
}
ret := &internal.SearchResult{
Total: int64(result.Total),
Hits: make([]internal.Match, 0, len(result.Hits)),
}
for _, hit := range result.Hits {
id, err := indexer_internal.ParseBase36(hit.ID)
if err != nil {
return nil, err
}
ret.Hits = append(ret.Hits, internal.Match{
ID: id,
})
}
return ret, nil
}
@@ -0,0 +1,18 @@
// Copyright 2018 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package bleve
import (
"testing"
"gitea.dev/modules/indexer/issues/internal/tests"
)
func TestBleveIndexer(t *testing.T) {
dir := t.TempDir()
indexer := NewIndexer(dir)
defer indexer.Close()
tests.TestIndexer(t, indexer)
}
+143
View File
@@ -0,0 +1,143 @@
// Copyright 2019 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package db
import (
"context"
"strings"
"sync"
"gitea.dev/models/db"
issue_model "gitea.dev/models/issues"
"gitea.dev/modules/indexer"
indexer_internal "gitea.dev/modules/indexer/internal"
inner_db "gitea.dev/modules/indexer/internal/db"
"gitea.dev/modules/indexer/issues/internal"
"gitea.dev/modules/util"
"xorm.io/builder"
)
var _ internal.Indexer = (*Indexer)(nil)
// Indexer implements Indexer interface to use database's like search
type Indexer struct {
indexer_internal.Indexer
}
func (i *Indexer) SupportedSearchModes() []indexer.SearchMode {
return indexer.SearchModesExactWords()
}
var GetIndexer = sync.OnceValue(func() *Indexer {
return &Indexer{Indexer: &inner_db.Indexer{}}
})
// Index dummy function
func (i *Indexer) Index(_ context.Context, _ ...*internal.IndexerData) error {
return nil
}
// Delete dummy function
func (i *Indexer) Delete(_ context.Context, _ ...int64) error {
return nil
}
func buildMatchQuery(mode indexer.SearchModeType, colName, keyword string) builder.Cond {
if mode == indexer.SearchModeExact {
return db.BuildCaseInsensitiveLike(colName, keyword)
}
// match words
cond := builder.NewCond()
fields := strings.Fields(keyword)
if len(fields) == 0 {
return builder.Expr("1=1")
}
for _, field := range fields {
if field == "" {
continue
}
cond = cond.And(db.BuildCaseInsensitiveLike(colName, field))
}
return cond
}
// Search searches for issues
func (i *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (*internal.SearchResult, error) {
// FIXME: I tried to avoid importing models here, but it seems to be impossible.
// We can provide a function to register the search function, so models/issues can register it.
// So models/issues will import modules/indexer/issues, it's OK because it's by design.
// But modules/indexer/issues has already imported models/issues to do UpdateRepoIndexer and UpdateIssueIndexer.
// And to avoid circular import, we have to move the functions to another package.
// I believe it should be services/indexer, sounds great!
// But the two functions are used in modules/notification/indexer, that means we will import services/indexer in modules/notification/indexer.
// So that's the root problem:
// The notification is defined in modules, but it's using lots of things should be in services.
cond := builder.NewCond()
if options.Keyword != "" {
repoCond := builder.In("repo_id", options.RepoIDs)
if len(options.RepoIDs) == 1 {
repoCond = builder.Eq{"repo_id": options.RepoIDs[0]}
}
subQuery := builder.Select("id").From("issue").Where(repoCond)
searchMode := util.IfZero(options.SearchMode, i.SupportedSearchModes()[0].ModeValue)
cond = builder.Or(
buildMatchQuery(searchMode, "issue.name", options.Keyword),
buildMatchQuery(searchMode, "issue.content", options.Keyword),
builder.In("issue.id", builder.Select("issue_id").
From("comment").
Where(builder.And(
builder.Eq{"type": issue_model.CommentTypeComment},
builder.In("issue_id", subQuery),
buildMatchQuery(searchMode, "content", options.Keyword),
)),
),
)
if options.IsKeywordNumeric() {
cond = cond.Or(
builder.Eq{"`index`": options.Keyword},
)
}
}
opt, err := ToDBOptions(ctx, options)
if err != nil {
return nil, err
}
// If pagesize == 0, return total count only. It's a special case for search count.
if options.Paginator != nil && options.Paginator.PageSize == 0 {
total, err := issue_model.CountIssues(ctx, opt, cond)
if err != nil {
return nil, err
}
return &internal.SearchResult{
Total: total,
}, nil
}
return i.FindWithIssueOptions(ctx, opt, cond)
}
func (i *Indexer) FindWithIssueOptions(ctx context.Context, opt *issue_model.IssuesOptions, otherConds ...builder.Cond) (*internal.SearchResult, error) {
ids, total, err := issue_model.IssueIDs(ctx, opt, otherConds...)
if err != nil {
return nil, err
}
hits := make([]internal.Match, 0, len(ids))
for _, id := range ids {
hits = append(hits, internal.Match{
ID: id,
})
}
return &internal.SearchResult{
Total: total,
Hits: hits,
}, nil
}
+116
View File
@@ -0,0 +1,116 @@
// Copyright 2023 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package db
import (
"context"
"fmt"
"strings"
"gitea.dev/models/db"
issue_model "gitea.dev/models/issues"
"gitea.dev/modules/container"
"gitea.dev/modules/indexer/issues/internal"
"gitea.dev/modules/optional"
"gitea.dev/modules/util"
)
func ToDBOptions(ctx context.Context, options *internal.SearchOptions) (*issue_model.IssuesOptions, error) {
var sortType string
switch options.SortBy {
case internal.SortByCreatedAsc:
sortType = "oldest"
case internal.SortByUpdatedAsc:
sortType = "leastupdate"
case internal.SortByCommentsAsc:
sortType = "leastcomment"
case internal.SortByDeadlineDesc:
sortType = "farduedate"
case internal.SortByCreatedDesc:
sortType = "newest"
case internal.SortByUpdatedDesc:
sortType = "recentupdate"
case internal.SortByCommentsDesc:
sortType = "mostcomment"
case internal.SortByDeadlineAsc:
sortType = "nearduedate"
default:
if strings.HasPrefix(string(options.SortBy), issue_model.ScopeSortPrefix) {
sortType = string(options.SortBy)
} else {
sortType = "newest"
}
}
// See the comment of issues_model.SearchOptions for the reason why we need to convert
convertID := func(id optional.Option[int64]) int64 {
if !id.Has() {
return 0
}
value := id.Value()
if value == 0 {
return db.NoConditionID
}
return value
}
opts := &issue_model.IssuesOptions{
Paginator: options.Paginator,
RepoIDs: options.RepoIDs,
AllPublic: options.AllPublic,
RepoCond: nil,
AssigneeID: options.AssigneeID,
PosterID: options.PosterID,
MentionedID: convertID(options.MentionID),
ReviewRequestedID: convertID(options.ReviewRequestedID),
ReviewedID: convertID(options.ReviewedID),
SubscriberID: convertID(options.SubscriberID),
ProjectIDs: util.Iif(options.NoProjectOnly, []int64{db.NoConditionID}, options.ProjectIDs),
IsClosed: options.IsClosed,
IsPull: options.IsPull,
IncludedLabelNames: nil,
ExcludedLabelNames: nil,
IncludeMilestones: nil,
SortType: sortType,
UpdatedAfterUnix: options.UpdatedAfterUnix.Value(),
UpdatedBeforeUnix: options.UpdatedBeforeUnix.Value(),
PriorityRepoID: 0,
IsArchived: options.IsArchived,
Owner: nil,
Team: nil,
Doer: nil,
}
if len(options.MilestoneIDs) == 1 && options.MilestoneIDs[0] == 0 {
opts.MilestoneIDs = []int64{db.NoConditionID}
} else {
opts.MilestoneIDs = options.MilestoneIDs
}
if options.NoLabelOnly {
opts.LabelIDs = []int64{0} // Be careful, it's zero, not db.NoConditionID
} else {
opts.LabelIDs = make([]int64, 0, len(options.IncludedLabelIDs)+len(options.ExcludedLabelIDs))
opts.LabelIDs = append(opts.LabelIDs, options.IncludedLabelIDs...)
for _, id := range options.ExcludedLabelIDs {
opts.LabelIDs = append(opts.LabelIDs, -id)
}
if len(options.IncludedLabelIDs) == 0 && len(options.IncludedAnyLabelIDs) > 0 {
labels, err := issue_model.GetLabelsByIDs(ctx, options.IncludedAnyLabelIDs, "name")
if err != nil {
return nil, fmt.Errorf("GetLabelsByIDs: %v", err)
}
set := container.Set[string]{}
for _, label := range labels {
if !set.Contains(label.Name) {
set.Add(label.Name)
opts.IncludedLabelNames = append(opts.IncludedLabelNames, label.Name)
}
}
}
}
return opts, nil
}
+112
View File
@@ -0,0 +1,112 @@
// Copyright 2023 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package issues
import (
"strings"
"gitea.dev/models/db"
issues_model "gitea.dev/models/issues"
"gitea.dev/modules/indexer/issues/internal"
"gitea.dev/modules/optional"
"gitea.dev/modules/setting"
)
func ToSearchOptions(keyword string, opts *issues_model.IssuesOptions) *SearchOptions {
if opts.IssueIDs != nil {
setting.PanicInDevOrTesting("Indexer SearchOptions doesn't support IssueIDs")
}
searchOpt := &SearchOptions{
Keyword: keyword,
RepoIDs: opts.RepoIDs,
AllPublic: opts.AllPublic,
IsPull: opts.IsPull,
IsClosed: opts.IsClosed,
IsArchived: opts.IsArchived,
}
if len(opts.LabelIDs) == 1 && opts.LabelIDs[0] == 0 {
searchOpt.NoLabelOnly = true
} else {
for _, labelID := range opts.LabelIDs {
if labelID > 0 {
searchOpt.IncludedLabelIDs = append(searchOpt.IncludedLabelIDs, labelID)
} else {
searchOpt.ExcludedLabelIDs = append(searchOpt.ExcludedLabelIDs, -labelID)
}
}
// opts.IncludedLabelNames and opts.ExcludedLabelNames are not supported here.
// It's not a TO DO, it's just unnecessary.
}
if len(opts.MilestoneIDs) == 1 && opts.MilestoneIDs[0] == db.NoConditionID {
searchOpt.MilestoneIDs = []int64{0}
} else {
searchOpt.MilestoneIDs = opts.MilestoneIDs
}
if len(opts.ProjectIDs) == 1 && opts.ProjectIDs[0] == db.NoConditionID {
searchOpt.NoProjectOnly = true
} else {
searchOpt.ProjectIDs = opts.ProjectIDs
}
searchOpt.AssigneeID = opts.AssigneeID
// See the comment of issues_model.SearchOptions for the reason why we need to convert
convertID := func(id int64) optional.Option[int64] {
if id > 0 {
return optional.Some(id)
}
if id == db.NoConditionID {
return optional.None[int64]()
}
return nil
}
searchOpt.PosterID = opts.PosterID
searchOpt.MentionID = convertID(opts.MentionedID)
searchOpt.ReviewedID = convertID(opts.ReviewedID)
searchOpt.ReviewRequestedID = convertID(opts.ReviewRequestedID)
searchOpt.SubscriberID = convertID(opts.SubscriberID)
if opts.UpdatedAfterUnix > 0 {
searchOpt.UpdatedAfterUnix = optional.Some(opts.UpdatedAfterUnix)
}
if opts.UpdatedBeforeUnix > 0 {
searchOpt.UpdatedBeforeUnix = optional.Some(opts.UpdatedBeforeUnix)
}
searchOpt.Paginator = opts.Paginator
switch opts.SortType {
case "", "latest":
searchOpt.SortBy = SortByCreatedDesc
case "oldest":
searchOpt.SortBy = SortByCreatedAsc
case "recentupdate":
searchOpt.SortBy = SortByUpdatedDesc
case "leastupdate":
searchOpt.SortBy = SortByUpdatedAsc
case "mostcomment":
searchOpt.SortBy = SortByCommentsDesc
case "leastcomment":
searchOpt.SortBy = SortByCommentsAsc
case "nearduedate":
searchOpt.SortBy = SortByDeadlineAsc
case "farduedate":
searchOpt.SortBy = SortByDeadlineDesc
case "priority", "priorityrepo", "project-column-sorting":
// Unsupported sort type for search
fallthrough
default:
if strings.HasPrefix(opts.SortType, issues_model.ScopeSortPrefix) {
searchOpt.SortBy = internal.SortBy(opts.SortType)
} else {
searchOpt.SortBy = SortByUpdatedDesc
}
}
return searchOpt
}
@@ -0,0 +1,255 @@
// Copyright 2019 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package elasticsearch
import (
"context"
"strconv"
"strings"
"gitea.dev/modules/graceful"
"gitea.dev/modules/indexer"
indexer_internal "gitea.dev/modules/indexer/internal"
es "gitea.dev/modules/indexer/internal/elasticsearch"
"gitea.dev/modules/indexer/issues/internal"
"gitea.dev/modules/util"
)
const issueIndexerLatestVersion = 3
var _ internal.Indexer = &Indexer{}
// Indexer implements Indexer interface
type Indexer struct {
*es.Indexer
}
func (b *Indexer) SupportedSearchModes() []indexer.SearchMode {
// TODO: es supports fuzzy search, but our code doesn't at the moment, and actually the default fuzziness is already "AUTO"
return indexer.SearchModesExactWords()
}
// NewIndexer creates a new elasticsearch indexer
func NewIndexer(url, indexerName string) *Indexer {
return &Indexer{Indexer: es.NewIndexer(url, indexerName, issueIndexerLatestVersion, defaultMapping)}
}
const (
defaultMapping = `
{
"mappings": {
"properties": {
"id": { "type": "integer", "index": true },
"repo_id": { "type": "integer", "index": true },
"is_public": { "type": "boolean", "index": true },
"title": { "type": "text", "index": true },
"content": { "type": "text", "index": true },
"comments": { "type" : "text", "index": true },
"is_pull": { "type": "boolean", "index": true },
"is_closed": { "type": "boolean", "index": true },
"is_archived": { "type": "boolean", "index": true },
"label_ids": { "type": "integer", "index": true },
"no_label": { "type": "boolean", "index": true },
"milestone_id": { "type": "integer", "index": true },
"project_ids": { "type": "integer", "index": true },
"no_project": { "type": "boolean", "index": true },
"poster_id": { "type": "integer", "index": true },
"assignee_id": { "type": "integer", "index": true },
"mention_ids": { "type": "integer", "index": true },
"reviewed_ids": { "type": "integer", "index": true },
"review_requested_ids": { "type": "integer", "index": true },
"subscriber_ids": { "type": "integer", "index": true },
"updated_unix": { "type": "integer", "index": true },
"created_unix": { "type": "integer", "index": true },
"deadline_unix": { "type": "integer", "index": true },
"comment_count": { "type": "integer", "index": true }
}
}
}
`
)
// Index will save the index data
func (b *Indexer) Index(ctx context.Context, issues ...*internal.IndexerData) error {
if len(issues) == 0 {
return nil
} else if len(issues) == 1 {
issue := issues[0]
return b.Indexer.Index(ctx, strconv.FormatInt(issue.ID, 10), issue)
}
ops := make([]es.BulkOp, 0, len(issues))
for _, issue := range issues {
ops = append(ops, es.IndexOp(strconv.FormatInt(issue.ID, 10), issue))
}
return b.Bulk(graceful.GetManager().HammerContext(), ops)
}
// Delete deletes indexes by ids
func (b *Indexer) Delete(ctx context.Context, ids ...int64) error {
if len(ids) == 0 {
return nil
} else if len(ids) == 1 {
return b.Indexer.Delete(ctx, strconv.FormatInt(ids[0], 10))
}
ops := make([]es.BulkOp, 0, len(ids))
for _, id := range ids {
ops = append(ops, es.DeleteOp(strconv.FormatInt(id, 10)))
}
return b.Bulk(graceful.GetManager().HammerContext(), ops)
}
// Search searches for issues by given conditions.
// Returns the matching issue IDs
func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (*internal.SearchResult, error) {
query := es.NewBoolQuery()
if options.Keyword != "" {
searchMode := util.IfZero(options.SearchMode, b.SupportedSearchModes()[0].ModeValue)
mm := es.NewMultiMatchQuery(options.Keyword, "title", "content", "comments")
if searchMode == indexer.SearchModeExact {
mm = mm.Type(es.MultiMatchTypePhrasePrefix)
} else {
mm = mm.Type(es.MultiMatchTypeBestFields).Operator("and")
}
query.Must(mm)
}
if len(options.RepoIDs) > 0 {
q := es.NewBoolQuery()
q.Should(es.TermsQuery("repo_id", es.ToAnySlice(options.RepoIDs)...))
if options.AllPublic {
q.Should(es.TermQuery("is_public", true))
}
query.Must(q)
}
if options.IsPull.Has() {
query.Must(es.TermQuery("is_pull", options.IsPull.Value()))
}
if options.IsClosed.Has() {
query.Must(es.TermQuery("is_closed", options.IsClosed.Value()))
}
if options.IsArchived.Has() {
query.Must(es.TermQuery("is_archived", options.IsArchived.Value()))
}
if options.NoLabelOnly {
query.Must(es.TermQuery("no_label", true))
} else {
if len(options.IncludedLabelIDs) > 0 {
q := es.NewBoolQuery()
for _, labelID := range options.IncludedLabelIDs {
q.Must(es.TermQuery("label_ids", labelID))
}
query.Must(q)
} else if len(options.IncludedAnyLabelIDs) > 0 {
query.Must(es.TermsQuery("label_ids", es.ToAnySlice(options.IncludedAnyLabelIDs)...))
}
if len(options.ExcludedLabelIDs) > 0 {
q := es.NewBoolQuery()
for _, labelID := range options.ExcludedLabelIDs {
q.MustNot(es.TermQuery("label_ids", labelID))
}
query.Must(q)
}
}
if len(options.MilestoneIDs) > 0 {
query.Must(es.TermsQuery("milestone_id", es.ToAnySlice(options.MilestoneIDs)...))
}
if options.NoProjectOnly {
query.Must(es.TermQuery("no_project", true))
} else if len(options.ProjectIDs) > 0 {
// FIXME: ISSUE-MULTIPLE-PROJECTS-FILTER: this logic is not right, it should use "AND" but not "OR"
query.Must(es.TermsQuery("project_ids", es.ToAnySlice(options.ProjectIDs)...))
}
if options.PosterID != "" {
// "(none)" becomes 0, it means no poster
posterIDInt64, _ := strconv.ParseInt(options.PosterID, 10, 64)
query.Must(es.TermQuery("poster_id", posterIDInt64))
}
if options.AssigneeID != "" {
if options.AssigneeID == "(any)" {
query.Must(es.NewRangeQuery("assignee_id").Gte(1))
} else {
// "(none)" becomes 0, it means no assignee
assigneeIDInt64, _ := strconv.ParseInt(options.AssigneeID, 10, 64)
query.Must(es.TermQuery("assignee_id", assigneeIDInt64))
}
}
if options.MentionID.Has() {
query.Must(es.TermQuery("mention_ids", options.MentionID.Value()))
}
if options.ReviewedID.Has() {
query.Must(es.TermQuery("reviewed_ids", options.ReviewedID.Value()))
}
if options.ReviewRequestedID.Has() {
query.Must(es.TermQuery("review_requested_ids", options.ReviewRequestedID.Value()))
}
if options.SubscriberID.Has() {
query.Must(es.TermQuery("subscriber_ids", options.SubscriberID.Value()))
}
if options.UpdatedAfterUnix.Has() || options.UpdatedBeforeUnix.Has() {
q := es.NewRangeQuery("updated_unix")
if options.UpdatedAfterUnix.Has() {
q.Gte(options.UpdatedAfterUnix.Value())
}
if options.UpdatedBeforeUnix.Has() {
q.Lte(options.UpdatedBeforeUnix.Value())
}
query.Must(q)
}
if options.SortBy == "" {
options.SortBy = internal.SortByCreatedAsc
}
sortBy := []es.SortField{
parseSortBy(options.SortBy),
{Field: "id", Desc: true},
}
// See https://stackoverflow.com/questions/35206409/elasticsearch-2-1-result-window-is-too-large-index-max-result-window/35221900
// TODO: make it configurable since it's configurable in elasticsearch
const maxPageSize = 10000
skip, limit := indexer_internal.ParsePaginator(options.Paginator, maxPageSize)
resp, err := b.Indexer.Search(ctx, es.SearchRequest{
Query: query,
Sort: sortBy,
From: skip,
Size: limit,
TrackTotal: true,
})
if err != nil {
return nil, err
}
hits := make([]internal.Match, 0, len(resp.Hits))
for _, hit := range resp.Hits {
id, _ := strconv.ParseInt(hit.ID, 10, 64)
hits = append(hits, internal.Match{ID: id})
}
return &internal.SearchResult{
Total: resp.Total,
Hits: hits,
}, nil
}
func parseSortBy(sortBy internal.SortBy) es.SortField {
field, desc := strings.CutPrefix(string(sortBy), "-")
return es.SortField{Field: field, Desc: desc}
}
@@ -0,0 +1,53 @@
// Copyright 2023 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package elasticsearch
import (
"fmt"
"net/http"
"net/url"
"testing"
"time"
"gitea.dev/modules/indexer/issues/internal/tests"
"gitea.dev/modules/test"
"github.com/stretchr/testify/require"
)
func TestElasticsearchIndexer(t *testing.T) {
// The elasticsearch instance started by pull-db-tests.yml > test-unit > services > elasticsearch
rawURL := test.ExternalServiceHTTP(t, "TEST_ELASTICSEARCH_URL", "http://elastic:changeme@elasticsearch:9200")
// Go's net/http does not auto-attach URL userinfo as Basic Auth, so extract
// it and set the header explicitly; otherwise auth-enforced clusters answer
// 401 and the probe never reports ready.
parsed, err := url.Parse(rawURL)
require.NoError(t, err)
user := parsed.User
parsed.User = nil
probeURL := parsed.String()
require.Eventually(t, func() bool {
req, err := http.NewRequest(http.MethodGet, probeURL, nil)
if err != nil {
return false
}
if user != nil {
pass, _ := user.Password()
req.SetBasicAuth(user.Username(), pass)
}
resp, err := http.DefaultClient.Do(req)
if err != nil {
return false
}
defer resp.Body.Close()
return resp.StatusCode == http.StatusOK
}, time.Minute, time.Second, "Expected elasticsearch to be up")
indexer := NewIndexer(rawURL, fmt.Sprintf("test_elasticsearch_indexer_%d", time.Now().Unix()))
defer indexer.Close()
tests.TestIndexer(t, indexer)
}
+327
View File
@@ -0,0 +1,327 @@
// Copyright 2018 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package issues
import (
"context"
"fmt"
"os"
"runtime/pprof"
"sync/atomic"
"time"
db_model "gitea.dev/models/db"
repo_model "gitea.dev/models/repo"
"gitea.dev/modules/graceful"
"gitea.dev/modules/indexer"
"gitea.dev/modules/indexer/issues/bleve"
"gitea.dev/modules/indexer/issues/db"
"gitea.dev/modules/indexer/issues/elasticsearch"
"gitea.dev/modules/indexer/issues/internal"
"gitea.dev/modules/indexer/issues/meilisearch"
"gitea.dev/modules/log"
"gitea.dev/modules/optional"
"gitea.dev/modules/process"
"gitea.dev/modules/queue"
"gitea.dev/modules/setting"
"gitea.dev/modules/util"
)
// IndexerMetadata is used to send data to the queue, so it contains only the ids.
// It may look weird, because it has to be compatible with the old queue data format.
// If the IsDelete flag is true, the IDs specify the issues to delete from the index without querying the database.
// If the IsDelete flag is false, the ID specify the issue to index, so Indexer will query the database to get the issue data.
// It should be noted that if the id is not existing in the database, it's index will be deleted too even if IsDelete is false.
// Valid values:
// - IsDelete = true, IDs = [1, 2, 3], and ID will be ignored
// - IsDelete = false, ID = 1, and IDs will be ignored
type IndexerMetadata struct {
ID int64 `json:"id"`
IsDelete bool `json:"is_delete"`
IDs []int64 `json:"ids"`
}
var (
// issueIndexerQueue queue of issue ids to be updated
issueIndexerQueue *queue.WorkerPoolQueue[*IndexerMetadata]
// globalIndexer is the global indexer, it cannot be nil.
// When the real indexer is not ready, it will be a dummy indexer which will return error to explain it's not ready.
// So it's always safe use it as *globalIndexer.Load() and call its methods.
globalIndexer atomic.Pointer[internal.Indexer]
dummyIndexer *internal.Indexer
)
func init() {
i := internal.NewDummyIndexer()
dummyIndexer = &i
globalIndexer.Store(dummyIndexer)
}
// InitIssueIndexer initialize issue indexer, syncReindex is true then reindex until
// all issue index done.
func InitIssueIndexer(syncReindex bool) {
ctx, _, finished := process.GetManager().AddTypedContext(context.Background(), "Service: IssueIndexer", process.SystemProcessType, false)
indexerInitWaitChannel := make(chan time.Duration, 1)
// Create the Queue
issueIndexerQueue = queue.CreateUniqueQueue(ctx, "issue_indexer", getIssueIndexerQueueHandler(ctx))
graceful.GetManager().RunAtTerminate(finished)
// Create the Indexer
go func() {
pprof.SetGoroutineLabels(ctx)
start := time.Now()
log.Info("PID %d: Initializing Issue Indexer: %s", os.Getpid(), setting.Indexer.IssueType)
var (
issueIndexer internal.Indexer
existed bool
err error
)
switch setting.Indexer.IssueType {
case "bleve":
defer func() {
if err := recover(); err != nil {
log.Error("PANIC whilst initializing issue indexer: %v\nStacktrace: %s", err, log.Stack(2))
log.Error("The indexer files are likely corrupted and may need to be deleted")
log.Error("You can completely remove the %q directory to make Gitea recreate the indexes", setting.Indexer.IssuePath)
globalIndexer.Store(dummyIndexer)
log.Fatal("PID: %d Unable to initialize the Bleve Issue Indexer at path: %s Error: %v", os.Getpid(), setting.Indexer.IssuePath, err)
}
}()
issueIndexer = bleve.NewIndexer(setting.Indexer.IssuePath)
existed, err = issueIndexer.Init(ctx)
if err != nil {
log.Fatal("Unable to initialize Bleve Issue Indexer at path: %s Error: %v", setting.Indexer.IssuePath, err)
}
case "elasticsearch":
issueIndexer = elasticsearch.NewIndexer(setting.Indexer.IssueConnStr, setting.Indexer.IssueIndexerName)
existed, err = issueIndexer.Init(ctx)
if err != nil {
log.Fatal("Unable to issueIndexer.Init with connection %s Error: %v", util.SanitizeCredentialURLs(setting.Indexer.IssueConnStr), err)
}
case "db":
issueIndexer = db.GetIndexer()
case "meilisearch":
issueIndexer = meilisearch.NewIndexer(setting.Indexer.IssueConnStr, setting.Indexer.IssueConnAuth, setting.Indexer.IssueIndexerName)
existed, err = issueIndexer.Init(ctx)
if err != nil {
log.Fatal("Unable to issueIndexer.Init with connection %s Error: %v", util.SanitizeCredentialURLs(setting.Indexer.IssueConnStr), err)
}
default:
log.Fatal("Unknown issue indexer type: %s", setting.Indexer.IssueType)
}
globalIndexer.Store(&issueIndexer)
graceful.GetManager().RunAtTerminate(func() {
log.Debug("Closing issue indexer")
(*globalIndexer.Load()).Close()
log.Info("PID: %d Issue Indexer closed", os.Getpid())
})
// Start processing the queue
go graceful.GetManager().RunWithCancel(issueIndexerQueue)
// Populate the index
if !existed {
if syncReindex {
graceful.GetManager().RunWithShutdownContext(populateIssueIndexer)
} else {
go graceful.GetManager().RunWithShutdownContext(populateIssueIndexer)
}
}
indexerInitWaitChannel <- time.Since(start)
close(indexerInitWaitChannel)
}()
if syncReindex {
select {
case <-indexerInitWaitChannel:
case <-graceful.GetManager().IsShutdown():
}
} else if setting.Indexer.StartupTimeout > 0 {
go func() {
pprof.SetGoroutineLabels(ctx)
timeout := setting.Indexer.StartupTimeout
if graceful.GetManager().IsChild() && setting.GracefulHammerTime > 0 {
timeout += setting.GracefulHammerTime
}
select {
case duration := <-indexerInitWaitChannel:
log.Info("Issue Indexer Initialization took %v", duration)
case <-graceful.GetManager().IsShutdown():
log.Warn("Shutdown occurred before issue index initialisation was complete")
case <-time.After(timeout):
issueIndexerQueue.ShutdownWait(5 * time.Second)
log.Fatal("Issue Indexer Initialization timed-out after: %v", timeout)
}
}()
}
}
func getIssueIndexerQueueHandler(ctx context.Context) func(items ...*IndexerMetadata) []*IndexerMetadata {
return func(items ...*IndexerMetadata) []*IndexerMetadata {
var unhandled []*IndexerMetadata
indexer := *globalIndexer.Load()
for _, item := range items {
log.Trace("IndexerMetadata Process: %d %v %t", item.ID, item.IDs, item.IsDelete)
if item.IsDelete {
if err := indexer.Delete(ctx, item.IDs...); err != nil {
log.Error("Issue indexer handler: failed to from index: %v Error: %v", item.IDs, err)
unhandled = append(unhandled, item)
}
continue
}
data, existed, err := getIssueIndexerData(ctx, item.ID)
if err != nil {
log.Error("Issue indexer handler: failed to get issue data of %d: %v", item.ID, err)
unhandled = append(unhandled, item)
continue
}
if !existed {
if err := indexer.Delete(ctx, item.ID); err != nil {
log.Error("Issue indexer handler: failed to delete issue %d from index: %v", item.ID, err)
unhandled = append(unhandled, item)
}
continue
}
if err := indexer.Index(ctx, data); err != nil {
log.Error("Issue indexer handler: failed to index issue %d: %v", item.ID, err)
unhandled = append(unhandled, item)
continue
}
}
return unhandled
}
}
// populateIssueIndexer populate the issue indexer with issue data
func populateIssueIndexer(ctx context.Context) {
ctx, _, finished := process.GetManager().AddTypedContext(ctx, "Service: PopulateIssueIndexer", process.SystemProcessType, true)
defer finished()
ctx = contextWithKeepRetry(ctx) // keep retrying since it's a background task
if err := PopulateIssueIndexer(ctx); err != nil {
log.Error("Issue indexer population failed: %v", err)
}
}
func PopulateIssueIndexer(ctx context.Context) error {
for page := 1; ; page++ {
select {
case <-ctx.Done():
return fmt.Errorf("shutdown before completion: %w", ctx.Err())
default:
}
repos, _, err := repo_model.SearchRepositoryByName(ctx, repo_model.SearchRepoOptions{
ListOptions: db_model.ListOptions{Page: page, PageSize: repo_model.RepositoryListDefaultPageSize},
OrderBy: db_model.SearchOrderByID,
Private: true,
Collaborate: optional.Some(false),
})
if err != nil {
log.Error("SearchRepositoryByName: %v", err)
continue
}
if len(repos) == 0 {
log.Debug("Issue Indexer population complete")
return nil
}
for _, repo := range repos {
if err := updateRepoIndexer(ctx, repo.ID); err != nil {
return fmt.Errorf("populate issue indexer for repo %d: %v", repo.ID, err)
}
}
}
}
// UpdateRepoIndexer add/update all issues of the repositories
func UpdateRepoIndexer(ctx context.Context, repoID int64) {
if err := updateRepoIndexer(ctx, repoID); err != nil {
log.Error("Unable to push repo %d to issue indexer: %v", repoID, err)
}
}
// UpdateIssueIndexer add/update an issue to the issue indexer
func UpdateIssueIndexer(ctx context.Context, issueID int64) {
if err := updateIssueIndexer(ctx, issueID); err != nil {
log.Error("Unable to push issue %d to issue indexer: %v", issueID, err)
}
}
// DeleteRepoIssueIndexer deletes repo's all issues indexes
func DeleteRepoIssueIndexer(ctx context.Context, repoID int64) {
if err := deleteRepoIssueIndexer(ctx, repoID); err != nil {
log.Error("Unable to push deleted repo %d to issue indexer: %v", repoID, err)
}
}
// IsAvailable checks if issue indexer is available
func IsAvailable(ctx context.Context) bool {
return (*globalIndexer.Load()).Ping(ctx) == nil
}
// SearchOptions indicates the options for searching issues
type SearchOptions = internal.SearchOptions
const (
SortByCreatedDesc = internal.SortByCreatedDesc
SortByUpdatedDesc = internal.SortByUpdatedDesc
SortByCommentsDesc = internal.SortByCommentsDesc
SortByDeadlineDesc = internal.SortByDeadlineDesc
SortByCreatedAsc = internal.SortByCreatedAsc
SortByUpdatedAsc = internal.SortByUpdatedAsc
SortByCommentsAsc = internal.SortByCommentsAsc
SortByDeadlineAsc = internal.SortByDeadlineAsc
)
// SearchIssues search issues by options.
func SearchIssues(ctx context.Context, opts *SearchOptions) ([]int64, int64, error) {
ix := *globalIndexer.Load()
if opts.Keyword == "" || opts.IsKeywordNumeric() {
// This is a conservative shortcut.
// If the keyword is empty or an integer, db has better (at least not worse) performance to filter issues.
// When the keyword is empty, it tends to listing rather than searching issues.
// So if the user creates an issue and list issues immediately, the issue may not be listed because the indexer needs time to index the issue.
// Even worse, the external indexer like elastic search may not be available for a while,
// and the user may not be able to list issues completely until it is available again.
ix = db.GetIndexer()
}
result, err := ix.Search(ctx, opts)
if err != nil {
return nil, 0, err
}
return SearchResultToIDSlice(result), result.Total, nil
}
func SearchResultToIDSlice(result *internal.SearchResult) []int64 {
ret := make([]int64, 0, len(result.Hits))
for _, hit := range result.Hits {
ret = append(ret, hit.ID)
}
return ret
}
// CountIssues counts issues by options. It is a shortcut of SearchIssues(ctx, opts) but only returns the total count.
func CountIssues(ctx context.Context, opts *SearchOptions) (int64, error) {
opts = opts.Copy(func(options *SearchOptions) { options.Paginator = &db_model.ListOptions{PageSize: 0} })
_, total, err := SearchIssues(ctx, opts)
return total, err
}
func SupportedSearchModes() []indexer.SearchMode {
gi := globalIndexer.Load()
if gi == nil {
return nil
}
return (*gi).SupportedSearchModes()
}
+501
View File
@@ -0,0 +1,501 @@
// Copyright 2019 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package issues
import (
"testing"
"gitea.dev/models/db"
"gitea.dev/models/issues"
"gitea.dev/models/unittest"
"gitea.dev/modules/indexer/issues/internal"
"gitea.dev/modules/optional"
"gitea.dev/modules/setting"
_ "gitea.dev/models"
_ "gitea.dev/models/actions"
_ "gitea.dev/models/activities"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestMain(m *testing.M) {
unittest.MainTest(m)
}
func TestDBSearchIssues(t *testing.T) {
require.NoError(t, unittest.PrepareTestDatabase())
setting.Indexer.IssueType = "db"
InitIssueIndexer(true)
t.Run("search issues with keyword", searchIssueWithKeyword)
t.Run("search issues by index", searchIssueByIndex)
t.Run("search issues in repo", searchIssueInRepo)
t.Run("search issues by ID", searchIssueByID)
t.Run("search issues is pr", searchIssueIsPull)
t.Run("search issues is closed", searchIssueIsClosed)
t.Run("search issues is archived", searchIssueIsArchived)
t.Run("search issues by milestone", searchIssueByMilestoneID)
t.Run("search issues by label", searchIssueByLabelID)
t.Run("search issues by time", searchIssueByTime)
t.Run("search issues with order", searchIssueWithOrder)
t.Run("search issues in project", searchIssueInProject)
t.Run("search issues with paginator", searchIssueWithPaginator)
t.Run("search issues with any assignee", searchIssueWithAnyAssignee)
}
func searchIssueWithKeyword(t *testing.T) {
tests := []struct {
opts SearchOptions
expectedIDs []int64
}{
{
SearchOptions{
Keyword: "issue2",
RepoIDs: []int64{1},
},
[]int64{2},
},
{
SearchOptions{
Keyword: "first",
RepoIDs: []int64{1},
},
[]int64{1},
},
{
SearchOptions{
Keyword: "for",
RepoIDs: []int64{1},
},
[]int64{11, 5, 3, 2, 1},
},
{
SearchOptions{
Keyword: "good",
RepoIDs: []int64{1},
},
[]int64{1},
},
}
for _, test := range tests {
t.Run(test.opts.Keyword, func(t *testing.T) {
issueIDs, _, err := SearchIssues(t.Context(), &test.opts)
require.NoError(t, err)
assert.Equal(t, test.expectedIDs, issueIDs)
})
}
}
func searchIssueByIndex(t *testing.T) {
tests := []struct {
opts SearchOptions
expectedIDs []int64
}{
{
SearchOptions{
Keyword: "1000",
RepoIDs: []int64{1},
},
[]int64{},
},
{
SearchOptions{
Keyword: "2",
RepoIDs: []int64{1, 2, 3, 32},
},
[]int64{17, 12, 7, 2},
},
{
SearchOptions{
Keyword: "1",
RepoIDs: []int64{58},
},
[]int64{19},
},
}
for _, test := range tests {
issueIDs, _, err := SearchIssues(t.Context(), &test.opts)
require.NoError(t, err)
assert.Equal(t, test.expectedIDs, issueIDs)
}
}
func searchIssueInRepo(t *testing.T) {
tests := []struct {
opts SearchOptions
expectedIDs []int64
}{
{
SearchOptions{
RepoIDs: []int64{1},
},
[]int64{11, 5, 3, 2, 1},
},
{
SearchOptions{
RepoIDs: []int64{2},
},
[]int64{7, 4},
},
{
SearchOptions{
RepoIDs: []int64{3},
},
[]int64{12, 6},
},
{
SearchOptions{
RepoIDs: []int64{4},
},
[]int64{},
},
{
SearchOptions{
RepoIDs: []int64{5},
},
[]int64{15},
},
}
for _, test := range tests {
issueIDs, _, err := SearchIssues(t.Context(), &test.opts)
require.NoError(t, err)
assert.Equal(t, test.expectedIDs, issueIDs)
}
}
func searchIssueByID(t *testing.T) {
tests := []struct {
opts SearchOptions
expectedIDs []int64
}{
{
opts: SearchOptions{
PosterID: "1",
},
expectedIDs: []int64{11, 6, 3, 2, 1},
},
{
opts: SearchOptions{
AssigneeID: "1",
},
expectedIDs: []int64{6, 1},
},
{
// NOTE: This tests no assignees filtering and also ToSearchOptions() to ensure it handles the filter correctly
opts: *ToSearchOptions("", &issues.IssuesOptions{AssigneeID: "(none)"}),
expectedIDs: []int64{22, 21, 16, 15, 14, 13, 12, 11, 20, 5, 19, 18, 10, 7, 4, 9, 8, 3, 2},
},
{
opts: SearchOptions{
MentionID: optional.Some(int64(4)),
},
expectedIDs: []int64{1},
},
{
opts: SearchOptions{
ReviewedID: optional.Some(int64(1)),
},
expectedIDs: []int64{},
},
{
opts: SearchOptions{
ReviewRequestedID: optional.Some(int64(1)),
},
expectedIDs: []int64{12},
},
{
opts: SearchOptions{
SubscriberID: optional.Some(int64(1)),
},
expectedIDs: []int64{11, 6, 5, 3, 2, 1},
},
{
// issue 20 request user 15 and team 5 which user 15 belongs to
// the review request number of issue 20 should be 1
opts: SearchOptions{
ReviewRequestedID: optional.Some(int64(15)),
},
expectedIDs: []int64{12, 20},
},
{
// user 20 approved the issue 20, so return nothing
opts: SearchOptions{
ReviewRequestedID: optional.Some(int64(20)),
},
expectedIDs: []int64{},
},
}
for _, test := range tests {
issueIDs, _, err := SearchIssues(t.Context(), &test.opts)
require.NoError(t, err)
assert.Equal(t, test.expectedIDs, issueIDs)
}
}
func searchIssueIsPull(t *testing.T) {
tests := []struct {
opts SearchOptions
expectedIDs []int64
}{
{
SearchOptions{
IsPull: optional.Some(false),
},
[]int64{17, 16, 15, 14, 13, 6, 5, 18, 10, 7, 4, 1},
},
{
SearchOptions{
IsPull: optional.Some(true),
},
[]int64{22, 21, 12, 11, 20, 19, 9, 8, 3, 2},
},
}
for _, test := range tests {
issueIDs, _, err := SearchIssues(t.Context(), &test.opts)
require.NoError(t, err)
assert.Equal(t, test.expectedIDs, issueIDs)
}
}
func searchIssueIsClosed(t *testing.T) {
tests := []struct {
opts SearchOptions
expectedIDs []int64
}{
{
SearchOptions{
IsClosed: optional.Some(false),
},
[]int64{22, 21, 17, 16, 15, 14, 13, 12, 11, 20, 6, 19, 18, 10, 7, 9, 8, 3, 2, 1},
},
{
SearchOptions{
IsClosed: optional.Some(true),
},
[]int64{5, 4},
},
}
for _, test := range tests {
issueIDs, _, err := SearchIssues(t.Context(), &test.opts)
require.NoError(t, err)
assert.Equal(t, test.expectedIDs, issueIDs)
}
}
func searchIssueIsArchived(t *testing.T) {
tests := []struct {
opts SearchOptions
expectedIDs []int64
}{
{
SearchOptions{
IsArchived: optional.Some(false),
},
[]int64{22, 21, 17, 16, 15, 13, 12, 11, 20, 6, 5, 19, 18, 10, 7, 4, 9, 8, 3, 2, 1},
},
{
SearchOptions{
IsArchived: optional.Some(true),
},
[]int64{14},
},
}
for _, test := range tests {
issueIDs, _, err := SearchIssues(t.Context(), &test.opts)
require.NoError(t, err)
assert.Equal(t, test.expectedIDs, issueIDs)
}
}
func searchIssueByMilestoneID(t *testing.T) {
tests := []struct {
opts SearchOptions
expectedIDs []int64
}{
{
SearchOptions{
MilestoneIDs: []int64{1},
},
[]int64{2},
},
{
SearchOptions{
MilestoneIDs: []int64{3},
},
[]int64{3},
},
}
for _, test := range tests {
issueIDs, _, err := SearchIssues(t.Context(), &test.opts)
require.NoError(t, err)
assert.Equal(t, test.expectedIDs, issueIDs)
}
}
func searchIssueByLabelID(t *testing.T) {
tests := []struct {
opts SearchOptions
expectedIDs []int64
}{
{
SearchOptions{
IncludedLabelIDs: []int64{1},
},
[]int64{2, 1},
},
{
SearchOptions{
IncludedLabelIDs: []int64{4},
},
[]int64{2},
},
{
SearchOptions{
ExcludedLabelIDs: []int64{1},
},
[]int64{22, 21, 17, 16, 15, 14, 13, 12, 11, 20, 6, 5, 19, 18, 10, 7, 4, 9, 8, 3},
},
}
for _, test := range tests {
issueIDs, _, err := SearchIssues(t.Context(), &test.opts)
require.NoError(t, err)
assert.Equal(t, test.expectedIDs, issueIDs)
}
}
func searchIssueByTime(t *testing.T) {
tests := []struct {
opts SearchOptions
expectedIDs []int64
}{
{
SearchOptions{
UpdatedAfterUnix: optional.Some(int64(0)),
},
[]int64{22, 21, 17, 16, 15, 14, 13, 12, 11, 20, 6, 5, 19, 18, 10, 7, 4, 9, 8, 3, 2, 1},
},
}
for _, test := range tests {
issueIDs, _, err := SearchIssues(t.Context(), &test.opts)
require.NoError(t, err)
assert.Equal(t, test.expectedIDs, issueIDs)
}
}
func searchIssueWithOrder(t *testing.T) {
tests := []struct {
opts SearchOptions
expectedIDs []int64
}{
{
SearchOptions{
SortBy: internal.SortByCreatedAsc,
},
[]int64{1, 2, 3, 8, 9, 4, 7, 10, 18, 19, 5, 6, 20, 11, 12, 13, 14, 15, 16, 17, 21, 22},
},
}
for _, test := range tests {
issueIDs, _, err := SearchIssues(t.Context(), &test.opts)
require.NoError(t, err)
assert.Equal(t, test.expectedIDs, issueIDs)
}
}
func searchIssueInProject(t *testing.T) {
tests := []struct {
opts SearchOptions
expectedIDs []int64
}{
{
SearchOptions{
ProjectIDs: []int64{1},
},
[]int64{5, 3, 2, 1},
},
}
for _, test := range tests {
issueIDs, _, err := SearchIssues(t.Context(), &test.opts)
require.NoError(t, err)
assert.Equal(t, test.expectedIDs, issueIDs)
}
// Test filtering for issues with no project assigned using dynamic validation
t.Run("no project assigned", func(t *testing.T) {
issueIDs, total, err := SearchIssues(t.Context(), &SearchOptions{
ProjectIDs: []int64{db.NoConditionID},
})
require.NoError(t, err)
assert.NotEmpty(t, issueIDs)
assert.Equal(t, total, int64(len(issueIDs)))
// Verify each returned issue actually has no project
for _, issueID := range issueIDs {
issue, err := issues.GetIssueByID(t.Context(), issueID)
require.NoError(t, err)
err = issue.LoadProjects(t.Context())
require.NoError(t, err)
assert.Empty(t, issue.Projects, "Issue %d should have no projects", issueID)
}
// Count total issues with no project to verify we got them all
allIssues, err := issues.Issues(t.Context(), &issues.IssuesOptions{
ProjectIDs: []int64{db.NoConditionID},
})
require.NoError(t, err)
assert.Len(t, issueIDs, len(allIssues), "Should return all issues with no project")
})
}
func searchIssueWithPaginator(t *testing.T) {
tests := []struct {
opts SearchOptions
expectedIDs []int64
expectedTotal int64
}{
{
SearchOptions{
Paginator: &db.ListOptions{
PageSize: 5,
},
},
[]int64{22, 21, 17, 16, 15},
22,
},
}
for _, test := range tests {
issueIDs, total, err := SearchIssues(t.Context(), &test.opts)
require.NoError(t, err)
assert.Equal(t, test.expectedIDs, issueIDs)
assert.Equal(t, test.expectedTotal, total)
}
}
func searchIssueWithAnyAssignee(t *testing.T) {
tests := []struct {
opts SearchOptions
expectedIDs []int64
expectedTotal int64
}{
{
SearchOptions{
AssigneeID: "(any)",
},
[]int64{17, 6, 1},
3,
},
}
for _, test := range tests {
issueIDs, total, err := SearchIssues(t.Context(), &test.opts)
require.NoError(t, err)
assert.Equal(t, test.expectedIDs, issueIDs)
assert.Equal(t, test.expectedTotal, total)
}
}
@@ -0,0 +1,48 @@
// Copyright 2023 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package internal
import (
"context"
"errors"
"gitea.dev/modules/indexer"
"gitea.dev/modules/indexer/internal"
)
// Indexer defines an interface to indexer issues contents
type Indexer interface {
internal.Indexer
Index(ctx context.Context, issue ...*IndexerData) error
Delete(ctx context.Context, ids ...int64) error
Search(ctx context.Context, options *SearchOptions) (*SearchResult, error)
SupportedSearchModes() []indexer.SearchMode
}
// NewDummyIndexer returns a dummy indexer
func NewDummyIndexer() Indexer {
return &dummyIndexer{
Indexer: internal.NewDummyIndexer(),
}
}
type dummyIndexer struct {
internal.Indexer
}
func (d *dummyIndexer) SupportedSearchModes() []indexer.SearchMode {
return nil
}
func (d *dummyIndexer) Index(_ context.Context, _ ...*IndexerData) error {
return errors.New("indexer is not ready")
}
func (d *dummyIndexer) Delete(_ context.Context, _ ...int64) error {
return errors.New("indexer is not ready")
}
func (d *dummyIndexer) Search(_ context.Context, _ *SearchOptions) (*SearchResult, error) {
return nil, errors.New("indexer is not ready")
}
+161
View File
@@ -0,0 +1,161 @@
// Copyright 2023 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package internal
import (
"strconv"
"gitea.dev/models/db"
"gitea.dev/modules/indexer"
"gitea.dev/modules/optional"
"gitea.dev/modules/timeutil"
)
// IndexerData data stored in the issue indexer
type IndexerData struct {
ID int64 `json:"id"`
RepoID int64 `json:"repo_id"`
IsPublic bool `json:"is_public"` // If the repo is public
// Fields used for keyword searching
Title string `json:"title"`
Content string `json:"content"`
Comments []string `json:"comments"`
// Fields used for filtering
IsPull bool `json:"is_pull"`
IsClosed bool `json:"is_closed"`
IsArchived bool `json:"is_archived"`
LabelIDs []int64 `json:"label_ids"`
NoLabel bool `json:"no_label"` // True if LabelIDs is empty
MilestoneID int64 `json:"milestone_id"`
ProjectIDs []int64 `json:"project_ids"`
NoProject bool `json:"no_project"` // True if ProjectIDs is empty
ProjectColumnMap map[int64]int64 `json:"project_column_map,omitempty"` // Maps project ID to column ID for each project the issue is in
PosterID int64 `json:"poster_id"`
AssigneeID int64 `json:"assignee_id"`
MentionIDs []int64 `json:"mention_ids"`
ReviewedIDs []int64 `json:"reviewed_ids"`
ReviewRequestedIDs []int64 `json:"review_requested_ids"`
SubscriberIDs []int64 `json:"subscriber_ids"`
UpdatedUnix timeutil.TimeStamp `json:"updated_unix"`
// Fields used for sorting
// UpdatedUnix is both used for filtering and sorting.
// ID is used for sorting too, to make the sorting stable.
CreatedUnix timeutil.TimeStamp `json:"created_unix"`
DeadlineUnix timeutil.TimeStamp `json:"deadline_unix"`
CommentCount int64 `json:"comment_count"`
}
// Match represents on search result
type Match struct {
ID int64 `json:"id"`
Score float64 `json:"score"`
}
// SearchResult represents search results
type SearchResult struct {
Total int64
Hits []Match
}
// SearchOptions represents search options.
//
// It has a slightly different design from database query options.
// In database query options, a field is never a pointer, so it could be confusing when it's zero value:
// Do you want to find data with a field value of 0, or do you not specify the field in the options?
// To avoid this confusion, db introduced db.NoConditionID(-1).
// So zero value means the field is not specified in the search options, and db.NoConditionID means "== 0" or "id NOT IN (SELECT id FROM ...)"
// It's still not ideal, it trapped developers many times.
// And sometimes -1 could be a valid value, like issue ID, negative numbers indicate exclusion.
// Since db.NoConditionID is for "db" (the package name is db), it makes sense not to use it in the indexer:
// Why do bleve/elasticsearch/meilisearch indexers need to know about db.NoConditionID?
// So in SearchOptions, we use pointer for fields which could be not specified,
// and always use the value to filter if it's not nil, even if it's zero or negative.
// It can handle almost all cases, if there is an exception, we can add a new field, like NoLabelOnly.
// Unfortunately, we still use db for the indexer and have to convert between db.NoConditionID and nil for legacy reasons.
type SearchOptions struct {
Keyword string // keyword to search
SearchMode indexer.SearchModeType
RepoIDs []int64 // repository IDs which the issues belong to
AllPublic bool // if include all public repositories
IsPull optional.Option[bool] // if the issues is a pull request
IsClosed optional.Option[bool] // if the issues is closed
IsArchived optional.Option[bool] // if the repo is archived
IncludedLabelIDs []int64 // labels the issues have
ExcludedLabelIDs []int64 // labels the issues don't have
IncludedAnyLabelIDs []int64 // labels the issues have at least one. It will be ignored if IncludedLabelIDs is not empty. It's an uncommon filter, but it has been supported accidentally by issues.IssuesOptions.IncludedLabelNames.
NoLabelOnly bool // if the issues have no label, if true, IncludedLabelIDs and ExcludedLabelIDs, IncludedAnyLabelIDs will be ignored
MilestoneIDs []int64 // milestones the issues have
ProjectIDs []int64 // project the issues belong to. FIXME: ISSUE-MULTIPLE-PROJECTS-FILTER: no multiple project filter support yet. Search logic is wrong.
NoProjectOnly bool // if the issues have no project, if true, ProjectIDs will be ignored
PosterID string // poster of the issues, "(none)" or "(any)" or a user ID
AssigneeID string // assignee of the issues, "(none)" or "(any)" or a user ID
MentionID optional.Option[int64] // mentioned user of the issues
ReviewedID optional.Option[int64] // reviewer of the issues
ReviewRequestedID optional.Option[int64] // requested reviewer of the issues
SubscriberID optional.Option[int64] // subscriber of the issues
UpdatedAfterUnix optional.Option[int64]
UpdatedBeforeUnix optional.Option[int64]
Paginator *db.ListOptions
SortBy SortBy // sort by field
}
// Copy returns a copy of the options.
// Be careful, it's not a deep copy, so `SearchOptions.RepoIDs = {...}` is OK while `SearchOptions.RepoIDs[0] = ...` is not.
func (o *SearchOptions) Copy(edit ...func(options *SearchOptions)) *SearchOptions {
if o == nil {
return nil
}
v := *o
for _, e := range edit {
e(&v)
}
return &v
}
// used for optimized issue index based search
func (o *SearchOptions) IsKeywordNumeric() bool {
_, err := strconv.Atoi(o.Keyword)
return err == nil
}
type SortBy string
const (
SortByCreatedDesc SortBy = "-created_unix"
SortByUpdatedDesc SortBy = "-updated_unix"
SortByCommentsDesc SortBy = "-comment_count"
SortByDeadlineDesc SortBy = "-deadline_unix"
SortByCreatedAsc SortBy = "created_unix"
SortByUpdatedAsc SortBy = "updated_unix"
SortByCommentsAsc SortBy = "comment_count"
SortByDeadlineAsc SortBy = "deadline_unix"
// Unsupported sort types which are supported by issues.IssuesOptions.SortType:
//
// - "priorityrepo":
// It's impossible to support it in the indexer.
// It is based on the specified repository in the request, so we cannot add static field to the indexer.
// If we do something like that query the issues in the specified repository first then append other issues,
// it will break the pagination.
//
// - "project-column-sorting":
// Although it's possible to support it by adding project.ProjectIssue.Sorting to the indexer,
// but what if the issue belongs to multiple projects?
// Since it's unsupported to search issues with keyword in project page, we don't need to support it.
)
@@ -0,0 +1,739 @@
// Copyright 2023 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
// This package contains tests for issues indexer modules.
// All the code in this package is only used for testing.
// Do not put any production code in this package to avoid it being included in the final binary.
package tests
import (
"fmt"
"slices"
"testing"
"time"
"gitea.dev/models/db"
"gitea.dev/modules/indexer/issues/internal"
"gitea.dev/modules/optional"
"gitea.dev/modules/timeutil"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestIndexer(t *testing.T, indexer internal.Indexer) {
_, err := indexer.Init(t.Context())
require.NoError(t, err)
require.NoError(t, indexer.Ping(t.Context()))
var (
ids []int64
data = map[int64]*internal.IndexerData{}
)
{
d := generateDefaultIndexerData()
for _, v := range d {
ids = append(ids, v.ID)
data[v.ID] = v
}
require.NoError(t, indexer.Index(t.Context(), d...))
waitData(t, indexer, int64(len(data)))
}
defer func() {
require.NoError(t, indexer.Delete(t.Context(), ids...))
}()
for _, c := range cases {
t.Run(c.Name, func(t *testing.T) {
if len(c.ExtraData) > 0 {
require.NoError(t, indexer.Index(t.Context(), c.ExtraData...))
for _, v := range c.ExtraData {
data[v.ID] = v
}
waitData(t, indexer, int64(len(data)))
defer func() {
for _, v := range c.ExtraData {
require.NoError(t, indexer.Delete(t.Context(), v.ID))
delete(data, v.ID)
}
waitData(t, indexer, int64(len(data)))
}()
}
result, err := indexer.Search(t.Context(), c.SearchOptions)
require.NoError(t, err)
if c.Expected != nil {
c.Expected(t, data, result)
} else {
ids := make([]int64, 0, len(result.Hits))
for _, hit := range result.Hits {
ids = append(ids, hit.ID)
}
assert.Equal(t, c.ExpectedIDs, ids)
assert.Equal(t, c.ExpectedTotal, result.Total)
}
// test counting
c.SearchOptions.Paginator = &db.ListOptions{PageSize: 0}
countResult, err := indexer.Search(t.Context(), c.SearchOptions)
require.NoError(t, err)
assert.Empty(t, countResult.Hits)
assert.Equal(t, result.Total, countResult.Total)
})
}
}
var cases = []*testIndexerCase{
{
Name: "default",
SearchOptions: &internal.SearchOptions{},
Expected: func(t *testing.T, data map[int64]*internal.IndexerData, result *internal.SearchResult) {
assert.Len(t, result.Hits, len(data))
assert.Equal(t, len(data), int(result.Total))
},
},
{
Name: "empty",
SearchOptions: &internal.SearchOptions{
Keyword: "f1dfac73-fda6-4a6b-b8a4-2408fcb8ef69",
},
ExpectedIDs: []int64{},
ExpectedTotal: 0,
},
{
Name: "with limit",
SearchOptions: &internal.SearchOptions{
Paginator: &db.ListOptions{
PageSize: 5,
},
},
Expected: func(t *testing.T, data map[int64]*internal.IndexerData, result *internal.SearchResult) {
assert.Len(t, result.Hits, 5)
assert.Equal(t, len(data), int(result.Total))
},
},
{
// Exercises the single-doc Index/Delete fast path in backends that have one (e.g. Elasticsearch).
Name: "single-doc index",
ExtraData: []*internal.IndexerData{
{ID: 999, Title: "solo-issue-marker"},
},
SearchOptions: &internal.SearchOptions{Keyword: "solo-issue-marker"},
ExpectedIDs: []int64{999},
ExpectedTotal: 1,
},
{
Name: "Keyword",
ExtraData: []*internal.IndexerData{
{ID: 1000, Title: "hi hello world"},
{ID: 1001, Content: "hi hello world"},
{ID: 1002, Comments: []string{"hi", "hello world"}},
},
SearchOptions: &internal.SearchOptions{
Keyword: "hello",
},
ExpectedIDs: []int64{1002, 1001, 1000},
ExpectedTotal: 3,
},
{
Name: "RepoIDs",
ExtraData: []*internal.IndexerData{
{ID: 1001, Title: "hello world", RepoID: 1, IsPublic: false},
{ID: 1002, Title: "hello world", RepoID: 1, IsPublic: false},
{ID: 1003, Title: "hello world", RepoID: 2, IsPublic: true},
{ID: 1004, Title: "hello world", RepoID: 2, IsPublic: true},
{ID: 1005, Title: "hello world", RepoID: 3, IsPublic: true},
{ID: 1006, Title: "hello world", RepoID: 4, IsPublic: false},
{ID: 1007, Title: "hello world", RepoID: 5, IsPublic: false},
},
SearchOptions: &internal.SearchOptions{
Keyword: "hello",
RepoIDs: []int64{1, 4},
},
ExpectedIDs: []int64{1006, 1002, 1001},
ExpectedTotal: 3,
},
{
Name: "RepoIDs and AllPublic",
ExtraData: []*internal.IndexerData{
{ID: 1001, Title: "hello world", RepoID: 1, IsPublic: false},
{ID: 1002, Title: "hello world", RepoID: 1, IsPublic: false},
{ID: 1003, Title: "hello world", RepoID: 2, IsPublic: true},
{ID: 1004, Title: "hello world", RepoID: 2, IsPublic: true},
{ID: 1005, Title: "hello world", RepoID: 3, IsPublic: true},
{ID: 1006, Title: "hello world", RepoID: 4, IsPublic: false},
{ID: 1007, Title: "hello world", RepoID: 5, IsPublic: false},
},
SearchOptions: &internal.SearchOptions{
Keyword: "hello",
RepoIDs: []int64{1, 4},
AllPublic: true,
},
ExpectedIDs: []int64{1006, 1005, 1004, 1003, 1002, 1001},
ExpectedTotal: 6,
},
{
Name: "issue only",
SearchOptions: &internal.SearchOptions{
Paginator: &db.ListOptions{
PageSize: 5,
},
IsPull: optional.Some(false),
},
Expected: func(t *testing.T, data map[int64]*internal.IndexerData, result *internal.SearchResult) {
assert.Len(t, result.Hits, 5)
for _, v := range result.Hits {
assert.False(t, data[v.ID].IsPull)
}
assert.Equal(t, countIndexerData(data, func(v *internal.IndexerData) bool { return !v.IsPull }), result.Total)
},
},
{
Name: "pull only",
SearchOptions: &internal.SearchOptions{
Paginator: &db.ListOptions{
PageSize: 5,
},
IsPull: optional.Some(true),
},
Expected: func(t *testing.T, data map[int64]*internal.IndexerData, result *internal.SearchResult) {
assert.Len(t, result.Hits, 5)
for _, v := range result.Hits {
assert.True(t, data[v.ID].IsPull)
}
assert.Equal(t, countIndexerData(data, func(v *internal.IndexerData) bool { return v.IsPull }), result.Total)
},
},
{
Name: "opened only",
SearchOptions: &internal.SearchOptions{
Paginator: &db.ListOptions{
PageSize: 5,
},
IsClosed: optional.Some(false),
},
Expected: func(t *testing.T, data map[int64]*internal.IndexerData, result *internal.SearchResult) {
assert.Len(t, result.Hits, 5)
for _, v := range result.Hits {
assert.False(t, data[v.ID].IsClosed)
}
assert.Equal(t, countIndexerData(data, func(v *internal.IndexerData) bool { return !v.IsClosed }), result.Total)
},
},
{
Name: "closed only",
SearchOptions: &internal.SearchOptions{
Paginator: &db.ListOptions{
PageSize: 5,
},
IsClosed: optional.Some(true),
},
Expected: func(t *testing.T, data map[int64]*internal.IndexerData, result *internal.SearchResult) {
assert.Len(t, result.Hits, 5)
for _, v := range result.Hits {
assert.True(t, data[v.ID].IsClosed)
}
assert.Equal(t, countIndexerData(data, func(v *internal.IndexerData) bool { return v.IsClosed }), result.Total)
},
},
{
Name: "labels",
ExtraData: []*internal.IndexerData{
{ID: 1000, Title: "hello a", LabelIDs: []int64{2000, 2001, 2002}},
{ID: 1001, Title: "hello b", LabelIDs: []int64{2000, 2001}},
{ID: 1002, Title: "hello c", LabelIDs: []int64{2000, 2001, 2003}},
{ID: 1003, Title: "hello d", LabelIDs: []int64{2000}},
{ID: 1004, Title: "hello e", LabelIDs: []int64{}},
},
SearchOptions: &internal.SearchOptions{
Keyword: "hello",
IncludedLabelIDs: []int64{2000, 2001},
ExcludedLabelIDs: []int64{2003},
},
ExpectedIDs: []int64{1001, 1000},
ExpectedTotal: 2,
},
{
Name: "include any labels",
ExtraData: []*internal.IndexerData{
{ID: 1000, Title: "hello a", LabelIDs: []int64{2000, 2001, 2002}},
{ID: 1001, Title: "hello b", LabelIDs: []int64{2001}},
{ID: 1002, Title: "hello c", LabelIDs: []int64{2000, 2001, 2003}},
{ID: 1003, Title: "hello d", LabelIDs: []int64{2002}},
{ID: 1004, Title: "hello e", LabelIDs: []int64{}},
},
SearchOptions: &internal.SearchOptions{
Keyword: "hello",
IncludedAnyLabelIDs: []int64{2001, 2002},
ExcludedLabelIDs: []int64{2003},
},
ExpectedIDs: []int64{1003, 1001, 1000},
ExpectedTotal: 3,
},
{
Name: "MilestoneIDs",
SearchOptions: &internal.SearchOptions{
Paginator: &db.ListOptions{
PageSize: 5,
},
MilestoneIDs: []int64{1, 2, 6},
},
Expected: func(t *testing.T, data map[int64]*internal.IndexerData, result *internal.SearchResult) {
assert.Len(t, result.Hits, 5)
for _, v := range result.Hits {
assert.Contains(t, []int64{1, 2, 6}, data[v.ID].MilestoneID)
}
assert.Equal(t, countIndexerData(data, func(v *internal.IndexerData) bool {
return v.MilestoneID == 1 || v.MilestoneID == 2 || v.MilestoneID == 6
}), result.Total)
},
},
{
Name: "no MilestoneIDs",
SearchOptions: &internal.SearchOptions{
Paginator: &db.ListOptions{
PageSize: 5,
},
MilestoneIDs: []int64{0},
},
Expected: func(t *testing.T, data map[int64]*internal.IndexerData, result *internal.SearchResult) {
assert.Len(t, result.Hits, 5)
for _, v := range result.Hits {
assert.Equal(t, int64(0), data[v.ID].MilestoneID)
}
assert.Equal(t, countIndexerData(data, func(v *internal.IndexerData) bool {
return v.MilestoneID == 0
}), result.Total)
},
},
{
Name: "ProjectIDs",
SearchOptions: &internal.SearchOptions{
Paginator: &db.ListOptions{
PageSize: 5,
},
ProjectIDs: []int64{1},
},
Expected: func(t *testing.T, data map[int64]*internal.IndexerData, result *internal.SearchResult) {
assert.Len(t, result.Hits, 5)
for _, v := range result.Hits {
assert.Contains(t, data[v.ID].ProjectIDs, int64(1))
}
assert.Equal(t, countIndexerData(data, func(v *internal.IndexerData) bool {
return slices.Contains(v.ProjectIDs, int64(1))
}), result.Total)
},
},
{
Name: "no ProjectIDs (empty array)",
SearchOptions: &internal.SearchOptions{
Paginator: &db.ListOptions{
PageSize: 50,
},
NoProjectOnly: true,
},
Expected: func(t *testing.T, data map[int64]*internal.IndexerData, result *internal.SearchResult) {
// Verify only issues with no projects are returned
for _, v := range result.Hits {
assert.Empty(t, data[v.ID].ProjectIDs, "Issue %d should have no projects", v.ID)
}
// Verify we got ALL issues with no projects
expectedCount := countIndexerData(data, func(v *internal.IndexerData) bool {
return len(v.ProjectIDs) == 0
})
assert.Equal(t, expectedCount, result.Total, "Should return all %d issues with no project", expectedCount)
},
},
{
Name: "PosterID",
SearchOptions: &internal.SearchOptions{
Paginator: &db.ListOptions{
PageSize: 5,
},
PosterID: "1",
},
Expected: func(t *testing.T, data map[int64]*internal.IndexerData, result *internal.SearchResult) {
assert.Len(t, result.Hits, 5)
for _, v := range result.Hits {
assert.Equal(t, int64(1), data[v.ID].PosterID)
}
assert.Equal(t, countIndexerData(data, func(v *internal.IndexerData) bool {
return v.PosterID == 1
}), result.Total)
},
},
{
Name: "AssigneeID",
SearchOptions: &internal.SearchOptions{
Paginator: &db.ListOptions{
PageSize: 5,
},
AssigneeID: "1",
},
Expected: func(t *testing.T, data map[int64]*internal.IndexerData, result *internal.SearchResult) {
assert.Len(t, result.Hits, 5)
for _, v := range result.Hits {
assert.Equal(t, int64(1), data[v.ID].AssigneeID)
}
assert.Equal(t, countIndexerData(data, func(v *internal.IndexerData) bool {
return v.AssigneeID == 1
}), result.Total)
},
},
{
Name: "no AssigneeID",
SearchOptions: &internal.SearchOptions{
Paginator: &db.ListOptions{
PageSize: 5,
},
AssigneeID: "(none)",
},
Expected: func(t *testing.T, data map[int64]*internal.IndexerData, result *internal.SearchResult) {
assert.Len(t, result.Hits, 5)
for _, v := range result.Hits {
assert.Equal(t, int64(0), data[v.ID].AssigneeID)
}
assert.Equal(t, countIndexerData(data, func(v *internal.IndexerData) bool {
return v.AssigneeID == 0
}), result.Total)
},
},
{
Name: "MentionID",
SearchOptions: &internal.SearchOptions{
Paginator: &db.ListOptions{
PageSize: 5,
},
MentionID: optional.Some(int64(1)),
},
Expected: func(t *testing.T, data map[int64]*internal.IndexerData, result *internal.SearchResult) {
assert.Len(t, result.Hits, 5)
for _, v := range result.Hits {
assert.Contains(t, data[v.ID].MentionIDs, int64(1))
}
assert.Equal(t, countIndexerData(data, func(v *internal.IndexerData) bool {
return slices.Contains(v.MentionIDs, 1)
}), result.Total)
},
},
{
Name: "ReviewedID",
SearchOptions: &internal.SearchOptions{
Paginator: &db.ListOptions{
PageSize: 5,
},
ReviewedID: optional.Some(int64(1)),
},
Expected: func(t *testing.T, data map[int64]*internal.IndexerData, result *internal.SearchResult) {
assert.Len(t, result.Hits, 5)
for _, v := range result.Hits {
assert.Contains(t, data[v.ID].ReviewedIDs, int64(1))
}
assert.Equal(t, countIndexerData(data, func(v *internal.IndexerData) bool {
return slices.Contains(v.ReviewedIDs, 1)
}), result.Total)
},
},
{
Name: "ReviewRequestedID",
SearchOptions: &internal.SearchOptions{
Paginator: &db.ListOptions{
PageSize: 5,
},
ReviewRequestedID: optional.Some(int64(1)),
},
Expected: func(t *testing.T, data map[int64]*internal.IndexerData, result *internal.SearchResult) {
assert.Len(t, result.Hits, 5)
for _, v := range result.Hits {
assert.Contains(t, data[v.ID].ReviewRequestedIDs, int64(1))
}
assert.Equal(t, countIndexerData(data, func(v *internal.IndexerData) bool {
return slices.Contains(v.ReviewRequestedIDs, 1)
}), result.Total)
},
},
{
Name: "SubscriberID",
SearchOptions: &internal.SearchOptions{
Paginator: &db.ListOptions{
PageSize: 5,
},
SubscriberID: optional.Some(int64(1)),
},
Expected: func(t *testing.T, data map[int64]*internal.IndexerData, result *internal.SearchResult) {
assert.Len(t, result.Hits, 5)
for _, v := range result.Hits {
assert.Contains(t, data[v.ID].SubscriberIDs, int64(1))
}
assert.Equal(t, countIndexerData(data, func(v *internal.IndexerData) bool {
return slices.Contains(v.SubscriberIDs, 1)
}), result.Total)
},
},
{
Name: "updated",
SearchOptions: &internal.SearchOptions{
Paginator: &db.ListOptions{
PageSize: 5,
},
UpdatedAfterUnix: optional.Some(int64(20)),
UpdatedBeforeUnix: optional.Some(int64(30)),
},
Expected: func(t *testing.T, data map[int64]*internal.IndexerData, result *internal.SearchResult) {
assert.Len(t, result.Hits, 5)
for _, v := range result.Hits {
assert.GreaterOrEqual(t, data[v.ID].UpdatedUnix, int64(20))
assert.LessOrEqual(t, data[v.ID].UpdatedUnix, int64(30))
}
assert.Equal(t, countIndexerData(data, func(v *internal.IndexerData) bool {
return data[v.ID].UpdatedUnix >= 20 && data[v.ID].UpdatedUnix <= 30
}), result.Total)
},
},
{
Name: "SortByCreatedDesc",
SearchOptions: &internal.SearchOptions{
Paginator: &db.ListOptionsAll,
SortBy: internal.SortByCreatedDesc,
},
Expected: func(t *testing.T, data map[int64]*internal.IndexerData, result *internal.SearchResult) {
assert.Len(t, result.Hits, len(data))
assert.Equal(t, len(data), int(result.Total))
for i, v := range result.Hits {
if i < len(result.Hits)-1 {
assert.GreaterOrEqual(t, data[v.ID].CreatedUnix, data[result.Hits[i+1].ID].CreatedUnix)
}
}
},
},
{
Name: "SortByUpdatedDesc",
SearchOptions: &internal.SearchOptions{
Paginator: &db.ListOptionsAll,
SortBy: internal.SortByUpdatedDesc,
},
Expected: func(t *testing.T, data map[int64]*internal.IndexerData, result *internal.SearchResult) {
assert.Len(t, result.Hits, len(data))
assert.Equal(t, len(data), int(result.Total))
for i, v := range result.Hits {
if i < len(result.Hits)-1 {
assert.GreaterOrEqual(t, data[v.ID].UpdatedUnix, data[result.Hits[i+1].ID].UpdatedUnix)
}
}
},
},
{
Name: "SortByCommentsDesc",
SearchOptions: &internal.SearchOptions{
Paginator: &db.ListOptionsAll,
SortBy: internal.SortByCommentsDesc,
},
Expected: func(t *testing.T, data map[int64]*internal.IndexerData, result *internal.SearchResult) {
assert.Len(t, result.Hits, len(data))
assert.Equal(t, len(data), int(result.Total))
for i, v := range result.Hits {
if i < len(result.Hits)-1 {
assert.GreaterOrEqual(t, data[v.ID].CommentCount, data[result.Hits[i+1].ID].CommentCount)
}
}
},
},
{
Name: "SortByDeadlineDesc",
SearchOptions: &internal.SearchOptions{
Paginator: &db.ListOptionsAll,
SortBy: internal.SortByDeadlineDesc,
},
Expected: func(t *testing.T, data map[int64]*internal.IndexerData, result *internal.SearchResult) {
assert.Len(t, result.Hits, len(data))
assert.Equal(t, len(data), int(result.Total))
for i, v := range result.Hits {
if i < len(result.Hits)-1 {
assert.GreaterOrEqual(t, data[v.ID].DeadlineUnix, data[result.Hits[i+1].ID].DeadlineUnix)
}
}
},
},
{
Name: "SortByCreatedAsc",
SearchOptions: &internal.SearchOptions{
Paginator: &db.ListOptionsAll,
SortBy: internal.SortByCreatedAsc,
},
Expected: func(t *testing.T, data map[int64]*internal.IndexerData, result *internal.SearchResult) {
assert.Len(t, result.Hits, len(data))
assert.Equal(t, len(data), int(result.Total))
for i, v := range result.Hits {
if i < len(result.Hits)-1 {
assert.LessOrEqual(t, data[v.ID].CreatedUnix, data[result.Hits[i+1].ID].CreatedUnix)
}
}
},
},
{
Name: "SortByUpdatedAsc",
SearchOptions: &internal.SearchOptions{
Paginator: &db.ListOptionsAll,
SortBy: internal.SortByUpdatedAsc,
},
Expected: func(t *testing.T, data map[int64]*internal.IndexerData, result *internal.SearchResult) {
assert.Len(t, result.Hits, len(data))
assert.Equal(t, len(data), int(result.Total))
for i, v := range result.Hits {
if i < len(result.Hits)-1 {
assert.LessOrEqual(t, data[v.ID].UpdatedUnix, data[result.Hits[i+1].ID].UpdatedUnix)
}
}
},
},
{
Name: "SortByCommentsAsc",
SearchOptions: &internal.SearchOptions{
Paginator: &db.ListOptionsAll,
SortBy: internal.SortByCommentsAsc,
},
Expected: func(t *testing.T, data map[int64]*internal.IndexerData, result *internal.SearchResult) {
assert.Len(t, result.Hits, len(data))
assert.Equal(t, len(data), int(result.Total))
for i, v := range result.Hits {
if i < len(result.Hits)-1 {
assert.LessOrEqual(t, data[v.ID].CommentCount, data[result.Hits[i+1].ID].CommentCount)
}
}
},
},
{
Name: "SortByDeadlineAsc",
SearchOptions: &internal.SearchOptions{
Paginator: &db.ListOptionsAll,
SortBy: internal.SortByDeadlineAsc,
},
Expected: func(t *testing.T, data map[int64]*internal.IndexerData, result *internal.SearchResult) {
assert.Len(t, result.Hits, len(data))
assert.Equal(t, len(data), int(result.Total))
for i, v := range result.Hits {
if i < len(result.Hits)-1 {
assert.LessOrEqual(t, data[v.ID].DeadlineUnix, data[result.Hits[i+1].ID].DeadlineUnix)
}
}
},
},
{
Name: "SearchAnyAssignee",
SearchOptions: &internal.SearchOptions{
AssigneeID: "(any)",
},
Expected: func(t *testing.T, data map[int64]*internal.IndexerData, result *internal.SearchResult) {
assert.Len(t, result.Hits, 180)
for _, v := range result.Hits {
assert.GreaterOrEqual(t, data[v.ID].AssigneeID, int64(1))
}
assert.Equal(t, countIndexerData(data, func(v *internal.IndexerData) bool {
return v.AssigneeID >= 1
}), result.Total)
},
},
}
type testIndexerCase struct {
Name string
ExtraData []*internal.IndexerData
SearchOptions *internal.SearchOptions
Expected func(t *testing.T, data map[int64]*internal.IndexerData, result *internal.SearchResult) // if nil, use ExpectedIDs, ExpectedTotal
ExpectedIDs []int64
ExpectedTotal int64
}
func generateDefaultIndexerData() []*internal.IndexerData {
var id int64
var data []*internal.IndexerData
for repoID := int64(1); repoID <= 10; repoID++ {
for issueIndex := int64(1); issueIndex <= 20; issueIndex++ {
id++
comments := make([]string, id%4)
for i := range comments {
comments[i] = fmt.Sprintf("comment%d", i)
}
labelIDs := make([]int64, id%5)
for i := range labelIDs {
labelIDs[i] = int64(i) + 1 // LabelID should not be 0
}
mentionIDs := make([]int64, id%6)
for i := range mentionIDs {
mentionIDs[i] = int64(i) + 1 // MentionID should not be 0
}
reviewedIDs := make([]int64, id%7)
for i := range reviewedIDs {
reviewedIDs[i] = int64(i) + 1 // ReviewID should not be 0
}
reviewRequestedIDs := make([]int64, id%8)
for i := range reviewRequestedIDs {
reviewRequestedIDs[i] = int64(i) + 1 // ReviewRequestedID should not be 0
}
subscriberIDs := make([]int64, id%9)
for i := range subscriberIDs {
subscriberIDs[i] = int64(i) + 1 // SubscriberID should not be 0
}
projectIDs := make([]int64, id%5)
for i := range projectIDs {
projectIDs[i] = int64(i) + 1 // projectID should not be 0
}
data = append(data, &internal.IndexerData{
ID: id,
RepoID: repoID,
IsPublic: repoID%2 == 0,
Title: fmt.Sprintf("issue%d of repo%d", issueIndex, repoID),
Content: fmt.Sprintf("content%d", issueIndex),
Comments: comments,
IsPull: issueIndex%2 == 0,
IsClosed: issueIndex%3 == 0,
LabelIDs: labelIDs,
NoLabel: len(labelIDs) == 0,
MilestoneID: issueIndex % 4,
ProjectIDs: projectIDs,
NoProject: len(projectIDs) == 0,
PosterID: id%10 + 1, // PosterID should not be 0
AssigneeID: issueIndex % 10,
MentionIDs: mentionIDs,
ReviewedIDs: reviewedIDs,
ReviewRequestedIDs: reviewRequestedIDs,
SubscriberIDs: subscriberIDs,
UpdatedUnix: timeutil.TimeStamp(id + issueIndex),
CreatedUnix: timeutil.TimeStamp(id),
DeadlineUnix: timeutil.TimeStamp(id + issueIndex + repoID),
CommentCount: int64(len(comments)),
})
}
}
return data
}
func countIndexerData(data map[int64]*internal.IndexerData, f func(v *internal.IndexerData) bool) int64 {
var count int64
for _, v := range data {
if f(v) {
count++
}
}
return count
}
// waitData waits for the indexer to index all data.
// Some engines like Elasticsearch index data asynchronously, so we need to wait for a while.
func waitData(t *testing.T, indexer internal.Indexer, total int64) {
assert.Eventually(t, func() bool {
result, err := indexer.Search(t.Context(), &internal.SearchOptions{Paginator: &db.ListOptions{}})
require.NoError(t, err)
return result.Total == total
}, 10*time.Second, 100*time.Millisecond, "expected total=%d", total)
}
@@ -0,0 +1,314 @@
// Copyright 2023 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package meilisearch
import (
"context"
"errors"
"fmt"
"strconv"
"strings"
"gitea.dev/modules/indexer"
indexer_internal "gitea.dev/modules/indexer/internal"
inner_meilisearch "gitea.dev/modules/indexer/internal/meilisearch"
"gitea.dev/modules/indexer/issues/internal"
"gitea.dev/modules/json"
"github.com/meilisearch/meilisearch-go"
)
const (
issueIndexerLatestVersion = 5
// TODO: make this configurable if necessary
maxTotalHits = 10000
)
// ErrMalformedResponse is never expected as we initialize the indexer ourself and so define the types.
var ErrMalformedResponse = errors.New("meilisearch returned unexpected malformed content")
var _ internal.Indexer = &Indexer{}
// Indexer implements Indexer interface
type Indexer struct {
inner *inner_meilisearch.Indexer
indexer_internal.Indexer // do not composite inner_meilisearch.Indexer directly to avoid exposing too much
}
func (b *Indexer) SupportedSearchModes() []indexer.SearchMode {
return indexer.SearchModesExactWords()
}
// NewIndexer creates a new meilisearch indexer
func NewIndexer(url, apiKey, indexerName string) *Indexer {
settings := &meilisearch.Settings{
// The default ranking rules of meilisearch are: ["words", "typo", "proximity", "attribute", "sort", "exactness"]
// So even if we specify the sort order, it could not be respected because the priority of "sort" is so low.
// So we need to specify the ranking rules to make sure the sort order is respected.
// See https://www.meilisearch.com/docs/learn/core_concepts/relevancy
RankingRules: []string{"sort", // make sure "sort" has the highest priority
"words", "typo", "proximity", "attribute", "exactness"},
SearchableAttributes: []string{
"title",
"content",
"comments",
},
DisplayedAttributes: []string{
"id",
"title",
"content",
"comments",
},
FilterableAttributes: []string{
"repo_id",
"is_public",
"is_pull",
"is_closed",
"is_archived",
"label_ids",
"no_label",
"milestone_id",
"project_ids",
"no_project",
"poster_id",
"assignee_id",
"mention_ids",
"reviewed_ids",
"review_requested_ids",
"subscriber_ids",
"updated_unix",
},
SortableAttributes: []string{
"updated_unix",
"created_unix",
"deadline_unix",
"comment_count",
"id",
},
Pagination: &meilisearch.Pagination{
MaxTotalHits: maxTotalHits,
},
}
inner := inner_meilisearch.NewIndexer(url, apiKey, indexerName, issueIndexerLatestVersion, settings)
indexer := &Indexer{
inner: inner,
Indexer: inner,
}
return indexer
}
// Index will save the index data
func (b *Indexer) Index(_ context.Context, issues ...*internal.IndexerData) error {
if len(issues) == 0 {
return nil
}
for _, issue := range issues {
// use default primary key which should be "id"
_, err := b.inner.Client.Index(b.inner.VersionedIndexName()).AddDocuments(issue, nil)
if err != nil {
return err
}
}
// TODO: bulk send index data
return nil
}
// Delete deletes indexes by ids
func (b *Indexer) Delete(_ context.Context, ids ...int64) error {
if len(ids) == 0 {
return nil
}
for _, id := range ids {
_, err := b.inner.Client.Index(b.inner.VersionedIndexName()).DeleteDocument(strconv.FormatInt(id, 10), nil)
if err != nil {
return err
}
}
// TODO: bulk send deletes
return nil
}
// Search searches for issues by given conditions.
// Returns the matching issue IDs
func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (*internal.SearchResult, error) {
query := inner_meilisearch.FilterAnd{}
if len(options.RepoIDs) > 0 {
q := &inner_meilisearch.FilterOr{}
q.Or(inner_meilisearch.NewFilterIn("repo_id", options.RepoIDs...))
if options.AllPublic {
q.Or(inner_meilisearch.NewFilterEq("is_public", true))
}
query.And(q)
}
if options.IsPull.Has() {
query.And(inner_meilisearch.NewFilterEq("is_pull", options.IsPull.Value()))
}
if options.IsClosed.Has() {
query.And(inner_meilisearch.NewFilterEq("is_closed", options.IsClosed.Value()))
}
if options.IsArchived.Has() {
query.And(inner_meilisearch.NewFilterEq("is_archived", options.IsArchived.Value()))
}
if options.NoLabelOnly {
query.And(inner_meilisearch.NewFilterEq("no_label", true))
} else {
if len(options.IncludedLabelIDs) > 0 {
q := &inner_meilisearch.FilterAnd{}
for _, labelID := range options.IncludedLabelIDs {
q.And(inner_meilisearch.NewFilterEq("label_ids", labelID))
}
query.And(q)
} else if len(options.IncludedAnyLabelIDs) > 0 {
query.And(inner_meilisearch.NewFilterIn("label_ids", options.IncludedAnyLabelIDs...))
}
if len(options.ExcludedLabelIDs) > 0 {
q := &inner_meilisearch.FilterAnd{}
for _, labelID := range options.ExcludedLabelIDs {
q.And(inner_meilisearch.NewFilterNot(inner_meilisearch.NewFilterEq("label_ids", labelID)))
}
query.And(q)
}
}
if len(options.MilestoneIDs) > 0 {
query.And(inner_meilisearch.NewFilterIn("milestone_id", options.MilestoneIDs...))
}
if options.NoProjectOnly {
query.And(inner_meilisearch.NewFilterEq("no_project", true))
} else if len(options.ProjectIDs) > 0 {
// FIXME: ISSUE-MULTIPLE-PROJECTS-FILTER: this logic is not right, it should use "AND" but not "OR"
query.And(inner_meilisearch.NewFilterIn("project_ids", options.ProjectIDs...))
}
if options.PosterID != "" {
// "(none)" becomes 0, it means no poster
posterIDInt64, _ := strconv.ParseInt(options.PosterID, 10, 64)
query.And(inner_meilisearch.NewFilterEq("poster_id", posterIDInt64))
}
if options.AssigneeID != "" {
if options.AssigneeID == "(any)" {
query.And(inner_meilisearch.NewFilterGte("assignee_id", 1))
} else {
// "(none)" becomes 0, it means no assignee
assigneeIDInt64, _ := strconv.ParseInt(options.AssigneeID, 10, 64)
query.And(inner_meilisearch.NewFilterEq("assignee_id", assigneeIDInt64))
}
}
if options.MentionID.Has() {
query.And(inner_meilisearch.NewFilterEq("mention_ids", options.MentionID.Value()))
}
if options.ReviewedID.Has() {
query.And(inner_meilisearch.NewFilterEq("reviewed_ids", options.ReviewedID.Value()))
}
if options.ReviewRequestedID.Has() {
query.And(inner_meilisearch.NewFilterEq("review_requested_ids", options.ReviewRequestedID.Value()))
}
if options.SubscriberID.Has() {
query.And(inner_meilisearch.NewFilterEq("subscriber_ids", options.SubscriberID.Value()))
}
if options.UpdatedAfterUnix.Has() {
query.And(inner_meilisearch.NewFilterGte("updated_unix", options.UpdatedAfterUnix.Value()))
}
if options.UpdatedBeforeUnix.Has() {
query.And(inner_meilisearch.NewFilterLte("updated_unix", options.UpdatedBeforeUnix.Value()))
}
if options.SortBy == "" {
options.SortBy = internal.SortByCreatedAsc
}
sortBy := []string{
parseSortBy(options.SortBy),
"id:desc",
}
skip, limit := indexer_internal.ParsePaginator(options.Paginator, maxTotalHits)
counting := limit == 0
if counting {
// If set limit to 0, it will be 20 by default, and -1 is not allowed.
// See https://www.meilisearch.com/docs/reference/api/search#limit
// So set limit to 1 to make the cost as low as possible, then clear the result before returning.
limit = 1
}
keyword := options.Keyword // default to match "words"
if options.SearchMode == indexer.SearchModeExact {
// https://www.meilisearch.com/docs/reference/api/search#phrase-search
keyword = doubleQuoteKeyword(keyword)
}
searchRes, err := b.inner.Client.Index(b.inner.VersionedIndexName()).Search(keyword, &meilisearch.SearchRequest{
Filter: query.Statement(),
Limit: int64(limit),
Offset: int64(skip),
Sort: sortBy,
MatchingStrategy: "all",
})
if err != nil {
return nil, err
}
if counting {
searchRes.Hits = nil
}
hits, err := convertHits(searchRes)
if err != nil {
return nil, err
}
return &internal.SearchResult{
Total: searchRes.EstimatedTotalHits,
Hits: hits,
}, nil
}
func parseSortBy(sortBy internal.SortBy) string {
field := strings.TrimPrefix(string(sortBy), "-")
if strings.HasPrefix(string(sortBy), "-") {
return field + ":desc"
}
return field + ":asc"
}
func doubleQuoteKeyword(k string) string {
kp := strings.Split(k, " ")
parts := 0
for i := range kp {
part := strings.Trim(kp[i], "\"")
if part != "" {
kp[parts] = fmt.Sprintf(`"%s"`, part)
parts++
}
}
return strings.Join(kp[:parts], " ")
}
func convertHits(searchRes *meilisearch.SearchResponse) ([]internal.Match, error) {
hits := make([]internal.Match, 0, len(searchRes.Hits))
for _, hit := range searchRes.Hits {
var issueID int64
if err := json.Unmarshal(hit["id"], &issueID); err != nil {
return nil, ErrMalformedResponse
}
hits = append(hits, internal.Match{
ID: issueID,
})
}
return hits, nil
}
@@ -0,0 +1,94 @@
// Copyright 2023 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package meilisearch
import (
"fmt"
"net/http"
"os"
"testing"
"time"
"gitea.dev/modules/indexer/issues/internal"
"gitea.dev/modules/indexer/issues/internal/tests"
"gitea.dev/modules/json"
"gitea.dev/modules/test"
"github.com/meilisearch/meilisearch-go"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestMeilisearchIndexer(t *testing.T) {
// The meilisearch instance started by pull-db-tests.yml > test-unit > services > meilisearch
url := test.ExternalServiceHTTP(t, "TEST_MEILISEARCH_URL", "http://meilisearch:7700")
key := os.Getenv("TEST_MEILISEARCH_KEY")
require.Eventually(t, func() bool {
resp, err := http.Get(url)
if err != nil {
return false
}
defer resp.Body.Close()
return resp.StatusCode == http.StatusOK
}, time.Minute, time.Second, "Expected meilisearch to be up")
indexer := NewIndexer(url, key, fmt.Sprintf("test_meilisearch_indexer_%d", time.Now().Unix()))
defer indexer.Close()
tests.TestIndexer(t, indexer)
}
func TestConvertHits(t *testing.T) {
convert := func(d any) []byte {
b, _ := json.Marshal(d)
return b
}
_, err := convertHits(&meilisearch.SearchResponse{
Hits: []meilisearch.Hit{
{
"aa": convert(1),
"bb": convert(2),
"cc": convert(3),
"dd": convert(4),
},
},
})
assert.ErrorIs(t, err, ErrMalformedResponse)
validResponse := &meilisearch.SearchResponse{
Hits: []meilisearch.Hit{
{
"id": convert(float64(11)),
"title": convert("a title"),
"content": convert("issue body with no match"),
"comments": convert([]any{"hey whats up?", "I'm currently bowling", "nice"}),
},
{
"id": convert(float64(22)),
"title": convert("Bowling as title"),
"content": convert(""),
"comments": convert([]any{}),
},
{
"id": convert(float64(33)),
"title": convert("Bowl-ing as fuzzy match"),
"content": convert(""),
"comments": convert([]any{}),
},
},
}
hits, err := convertHits(validResponse)
assert.NoError(t, err)
assert.Equal(t, []internal.Match{{ID: 11}, {ID: 22}, {ID: 33}}, hits)
}
func TestDoubleQuoteKeyword(t *testing.T) {
assert.Empty(t, doubleQuoteKeyword(""))
assert.Equal(t, `"a" "b" "c"`, doubleQuoteKeyword("a b c"))
assert.Equal(t, `"a" "d" "g"`, doubleQuoteKeyword("a d g"))
assert.Equal(t, `"a" "d" "g"`, doubleQuoteKeyword("a d g"))
assert.Equal(t, `"a" "d" "g"`, doubleQuoteKeyword(`a "" "d" """g`))
}
+198
View File
@@ -0,0 +1,198 @@
// Copyright 2023 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package issues
import (
"context"
"errors"
"fmt"
"gitea.dev/models/db"
issue_model "gitea.dev/models/issues"
"gitea.dev/modules/container"
"gitea.dev/modules/indexer/issues/internal"
"gitea.dev/modules/log"
"gitea.dev/modules/queue"
)
// getIssueIndexerData returns the indexer data of an issue and a bool value indicating whether the issue exists.
func getIssueIndexerData(ctx context.Context, issueID int64) (*internal.IndexerData, bool, error) {
issue, err := issue_model.GetIssueByID(ctx, issueID)
if err != nil {
if issue_model.IsErrIssueNotExist(err) {
return nil, false, nil
}
return nil, false, err
}
// FIXME: what if users want to search for a review comment of a pull request?
// The comment type is CommentTypeCode or CommentTypeReview.
// But LoadDiscussComments only loads CommentTypeComment.
if err := issue.LoadDiscussComments(ctx); err != nil {
return nil, false, err
}
comments := make([]string, 0, len(issue.Comments))
for _, comment := range issue.Comments {
if comment.Content != "" {
// what ever the comment type is, index the content if it is not empty.
comments = append(comments, comment.Content)
}
}
if err := issue.LoadAttributes(ctx); err != nil {
return nil, false, err
}
labels := make([]int64, 0, len(issue.Labels))
for _, label := range issue.Labels {
labels = append(labels, label.ID)
}
mentionIDs, err := issue_model.GetIssueMentionIDs(ctx, issueID)
if err != nil {
return nil, false, err
}
var (
reviewedIDs []int64
reviewRequestedIDs []int64
)
{
reviews, err := issue_model.FindReviews(ctx, issue_model.FindReviewOptions{
ListOptions: db.ListOptionsAll,
IssueID: issueID,
OfficialOnly: false,
})
if err != nil {
return nil, false, err
}
reviewedIDsSet := make(container.Set[int64], len(reviews))
reviewRequestedIDsSet := make(container.Set[int64], len(reviews))
for _, review := range reviews {
if review.Type == issue_model.ReviewTypeRequest {
reviewRequestedIDsSet.Add(review.ReviewerID)
} else {
reviewedIDsSet.Add(review.ReviewerID)
}
}
reviewedIDs = reviewedIDsSet.Values()
reviewRequestedIDs = reviewRequestedIDsSet.Values()
}
subscriberIDs, err := issue_model.GetIssueWatchersIDs(ctx, issue.ID, true)
if err != nil {
return nil, false, err
}
projectIDs := make([]int64, 0, len(issue.Projects))
for _, project := range issue.Projects {
projectIDs = append(projectIDs, project.ID)
}
if err := issue.Repo.LoadOwner(ctx); err != nil {
return nil, false, fmt.Errorf("issue.Repo.LoadOwner: %w", err)
}
return &internal.IndexerData{
ID: issue.ID,
RepoID: issue.RepoID,
IsPublic: !issue.Repo.IsPrivate && issue.Repo.Owner.Visibility.IsPublic(),
Title: issue.Title,
Content: issue.Content,
Comments: comments,
IsPull: issue.IsPull,
IsClosed: issue.IsClosed,
IsArchived: issue.Repo.IsArchived,
LabelIDs: labels,
NoLabel: len(labels) == 0,
MilestoneID: issue.MilestoneID,
ProjectIDs: projectIDs,
NoProject: len(projectIDs) == 0,
PosterID: issue.PosterID,
AssigneeID: issue.AssigneeID,
MentionIDs: mentionIDs,
ReviewedIDs: reviewedIDs,
ReviewRequestedIDs: reviewRequestedIDs,
SubscriberIDs: subscriberIDs,
UpdatedUnix: issue.UpdatedUnix,
CreatedUnix: issue.CreatedUnix,
DeadlineUnix: issue.DeadlineUnix,
CommentCount: int64(len(issue.Comments)),
}, true, nil
}
func updateRepoIndexer(ctx context.Context, repoID int64) error {
ids, err := issue_model.GetIssueIDsByRepoID(ctx, repoID)
if err != nil {
return fmt.Errorf("issue_model.GetIssueIDsByRepoID: %w", err)
}
for _, id := range ids {
if err := updateIssueIndexer(ctx, id); err != nil {
return err
}
}
return nil
}
func updateIssueIndexer(ctx context.Context, issueID int64) error {
return pushIssueIndexerQueue(ctx, &IndexerMetadata{ID: issueID})
}
func deleteRepoIssueIndexer(ctx context.Context, repoID int64) error {
var ids []int64
ids, err := issue_model.GetIssueIDsByRepoID(ctx, repoID)
if err != nil {
return fmt.Errorf("issue_model.GetIssueIDsByRepoID: %w", err)
}
if len(ids) == 0 {
return nil
}
return pushIssueIndexerQueue(ctx, &IndexerMetadata{
IDs: ids,
IsDelete: true,
})
}
type keepRetryKey struct{}
// contextWithKeepRetry returns a context with a key indicating that the indexer should keep retrying.
// Please note that it's for background tasks only, and it should not be used for user requests, or it may cause blocking.
func contextWithKeepRetry(ctx context.Context) context.Context {
return context.WithValue(ctx, keepRetryKey{}, true)
}
func pushIssueIndexerQueue(ctx context.Context, data *IndexerMetadata) error {
if issueIndexerQueue == nil {
// Some unit tests will trigger indexing, but the queue is not initialized.
// It's OK to ignore it, but log a warning message in case it's not a unit test.
log.Warn("Trying to push %+v to issue indexer queue, but the queue is not initialized, it's OK if it's a unit test", data)
return nil
}
for {
select {
case <-ctx.Done():
return ctx.Err()
default:
}
err := issueIndexerQueue.Push(data)
if errors.Is(err, queue.ErrAlreadyInQueue) {
return nil
}
if errors.Is(err, context.DeadlineExceeded) { // the queue is full
log.Warn("It seems that issue indexer is slow and the queue is full. Please check the issue indexer or increase the queue size.")
if ctx.Value(keepRetryKey{}) == nil {
return err
}
// It will be better to increase the queue size instead of retrying, but users may ignore the previous warning message.
// However, even it retries, it may still cause index loss when there's a deadline in the context.
log.Debug("Retry to push %+v to issue indexer queue", data)
continue
}
return err
}
}
+85
View File
@@ -0,0 +1,85 @@
// Copyright 2020 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package stats
import (
"fmt"
repo_model "gitea.dev/models/repo"
"gitea.dev/modules/git"
"gitea.dev/modules/git/languagestats"
"gitea.dev/modules/gitrepo"
"gitea.dev/modules/graceful"
"gitea.dev/modules/log"
"gitea.dev/modules/process"
"gitea.dev/modules/setting"
)
// DBIndexer implements Indexer interface to use database's like search
type DBIndexer struct{}
// Index repository status function
func (db *DBIndexer) Index(id int64) error {
ctx, _, finished := process.GetManager().AddContext(graceful.GetManager().ShutdownContext(), fmt.Sprintf("Stats.DB Index Repo[%d]", id))
defer finished()
repo, err := repo_model.GetRepositoryByID(ctx, id)
if err != nil {
return err
}
if repo.IsEmpty {
return nil
}
status, err := repo_model.GetIndexerStatus(ctx, repo, repo_model.RepoIndexerTypeStats)
if err != nil {
return err
}
gitRepo, err := gitrepo.OpenRepository(ctx, repo)
if err != nil {
if err.Error() == "no such file or directory" {
return nil
}
return err
}
defer gitRepo.Close()
// Get latest commit for default branch
commitID, err := gitRepo.GetBranchCommitID(repo.DefaultBranch)
if err != nil {
if git.IsErrBranchNotExist(err) || git.IsErrNotExist(err) || setting.IsInTesting {
log.Debug("Unable to get commit ID for default branch %s in %s ... skipping this repository", repo.DefaultBranch, repo.FullName())
return nil
}
log.Error("Unable to get commit ID for default branch %s in %s. Error: %v", repo.DefaultBranch, repo.FullName(), err)
return err
}
// Do not recalculate stats if already calculated for this commit
if status.CommitSha == commitID {
return nil
}
// Calculate and save language statistics to database
stats, err := languagestats.GetLanguageStats(gitRepo, commitID)
if err != nil {
if !setting.IsInTesting {
log.Error("Unable to get language stats for ID %s for default branch %s in %s. Error: %v", commitID, repo.DefaultBranch, repo.FullName(), err)
}
return err
}
err = repo_model.UpdateLanguageStats(ctx, repo, commitID, stats)
if err != nil {
log.Error("Unable to update language stats for ID %s for default branch %s in %s. Error: %v", commitID, repo.DefaultBranch, repo.FullName(), err)
return err
}
log.Debug("DBIndexer completed language stats for ID %s for default branch %s in %s. stats count: %d", commitID, repo.DefaultBranch, repo.FullName(), len(stats))
return nil
}
// Close dummy function
func (db *DBIndexer) Close() {
}
+88
View File
@@ -0,0 +1,88 @@
// Copyright 2020 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package stats
import (
"context"
"gitea.dev/models/db"
repo_model "gitea.dev/models/repo"
"gitea.dev/modules/graceful"
"gitea.dev/modules/log"
)
// Indexer defines an interface to index repository stats
// TODO: this indexer is quite different from the others, maybe this package should be moved out from module/indexer
type Indexer interface {
Index(id int64) error
Close()
}
// indexer represents a indexer instance
var indexer Indexer
// Init initialize the repo indexer
func Init() error {
indexer = &DBIndexer{}
if err := initStatsQueue(); err != nil {
return err
}
go populateRepoIndexer(graceful.GetManager().ShutdownContext())
return nil
}
// populateRepoIndexer populate the repo indexer with pre-existing data. This
// should only be run when the indexer is created for the first time.
func populateRepoIndexer(ctx context.Context) {
log.Info("Populating the repo stats indexer with existing repositories")
isShutdown := graceful.GetManager().IsShutdown()
exist, err := db.IsTableNotEmpty("repository")
if err != nil {
log.Fatal("System error: %v", err)
} else if !exist {
return
}
var maxRepoID int64
if maxRepoID, err = db.GetMaxID("repository"); err != nil {
log.Fatal("System error: %v", err)
}
// start with the maximum existing repo ID and work backwards, so that we
// don't include repos that are created after gitea starts; such repos will
// already be added to the indexer, and we don't need to add them again.
for maxRepoID > 0 {
select {
case <-isShutdown:
log.Info("Repository Stats Indexer population shutdown before completion")
return
default:
}
ids, err := repo_model.GetUnindexedRepos(ctx, repo_model.RepoIndexerTypeStats, maxRepoID, 0, 50)
if err != nil {
log.Error("populateRepoIndexer: %v", err)
return
} else if len(ids) == 0 {
break
}
for _, id := range ids {
select {
case <-isShutdown:
log.Info("Repository Stats Indexer population shutdown before completion")
return
default:
}
if err := statsQueue.Push(id); err != nil {
log.Error("statsQueue.Push: %v", err)
}
maxRepoID = id - 1
}
}
log.Info("Done (re)populating the repo stats indexer with existing repositories")
}
+49
View File
@@ -0,0 +1,49 @@
// Copyright 2020 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package stats
import (
"testing"
"time"
repo_model "gitea.dev/models/repo"
"gitea.dev/models/unittest"
"gitea.dev/modules/queue"
"gitea.dev/modules/setting"
_ "gitea.dev/models"
_ "gitea.dev/models/actions"
_ "gitea.dev/models/activities"
"github.com/stretchr/testify/assert"
)
func TestMain(m *testing.M) {
unittest.MainTest(m)
}
func TestRepoStatsIndex(t *testing.T) {
assert.NoError(t, unittest.PrepareTestDatabase())
setting.CfgProvider, _ = setting.NewConfigProviderFromData("")
setting.LoadQueueSettings()
err := Init()
assert.NoError(t, err)
repo, err := repo_model.GetRepositoryByID(t.Context(), 1)
assert.NoError(t, err)
err = UpdateRepoIndexer(repo)
assert.NoError(t, err)
assert.NoError(t, queue.GetManager().FlushAll(t.Context(), 5*time.Second))
status, err := repo_model.GetIndexerStatus(t.Context(), repo, repo_model.RepoIndexerTypeStats)
assert.NoError(t, err)
assert.Equal(t, "65f1bf27bc3bf70f64657658635e66094edbcb4d", status.CommitSha)
langs, err := repo_model.GetTopLanguageStats(t.Context(), repo, 5)
assert.NoError(t, err)
assert.Empty(t, langs)
}
+49
View File
@@ -0,0 +1,49 @@
// Copyright 2020 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package stats
import (
"errors"
repo_model "gitea.dev/models/repo"
"gitea.dev/modules/graceful"
"gitea.dev/modules/log"
"gitea.dev/modules/queue"
"gitea.dev/modules/setting"
)
// statsQueue represents a queue to handle repository stats updates
var statsQueue *queue.WorkerPoolQueue[int64]
// handle passed PR IDs and test the PRs
func handler(items ...int64) []int64 {
for _, opts := range items {
if err := indexer.Index(opts); err != nil {
if !setting.IsInTesting {
log.Error("stats queue indexer.Index(%d) failed: %v", opts, err)
}
}
}
return nil
}
func initStatsQueue() error {
statsQueue = queue.CreateUniqueQueue(graceful.GetManager().ShutdownContext(), "repo_stats_update", handler)
if statsQueue == nil {
return errors.New("unable to create repo_stats_update queue")
}
go graceful.GetManager().RunWithCancel(statsQueue)
return nil
}
// UpdateRepoIndexer update a repository's entries in the indexer
func UpdateRepoIndexer(repo *repo_model.Repository) error {
if err := statsQueue.Push(repo.ID); err != nil {
if err != queue.ErrAlreadyInQueue {
return err
}
log.Debug("Repo ID: %d already queued", repo.ID)
}
return nil
}