初始提交: Gitea 项目代码

This commit is contained in:
root
2026-05-30 22:47:36 +08:00
commit f288f76350
6116 changed files with 776822 additions and 0 deletions
+392
View File
@@ -0,0 +1,392 @@
// Copyright 2019 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package bleve
import (
"context"
"fmt"
"io"
"strconv"
"strings"
"time"
repo_model "gitea.dev/models/repo"
"gitea.dev/modules/analyze"
"gitea.dev/modules/charset"
"gitea.dev/modules/git"
"gitea.dev/modules/git/gitcmd"
"gitea.dev/modules/gitrepo"
"gitea.dev/modules/indexer"
path_filter "gitea.dev/modules/indexer/code/bleve/token/path"
"gitea.dev/modules/indexer/code/internal"
indexer_internal "gitea.dev/modules/indexer/internal"
inner_bleve "gitea.dev/modules/indexer/internal/bleve"
"gitea.dev/modules/setting"
"gitea.dev/modules/timeutil"
"gitea.dev/modules/typesniffer"
"gitea.dev/modules/util"
"github.com/blevesearch/bleve/v2"
analyzer_custom "github.com/blevesearch/bleve/v2/analysis/analyzer/custom"
analyzer_keyword "github.com/blevesearch/bleve/v2/analysis/analyzer/keyword"
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
"github.com/blevesearch/bleve/v2/analysis/token/unicodenorm"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/letter"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
"github.com/blevesearch/bleve/v2/mapping"
"github.com/blevesearch/bleve/v2/search/query"
"github.com/go-enry/go-enry/v2"
)
const (
// unicodeNormalizeName is the name under which the Unicode normalization
// token filter is registered on the index mapping and referenced by the analyzers.
unicodeNormalizeName = "unicodeNormalize"
// maxBatchSize is the batch size handed to inner_bleve.NewFlushingBatch by Index and Delete.
maxBatchSize = 16
)
// addUnicodeNormalizeTokenFilter registers a unicodenorm token filter using
// the NFC form on m, under the name unicodeNormalizeName.
func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
	filterConfig := map[string]any{
		"type": unicodenorm.Name,
		"form": unicodenorm.NFC,
	}
	return m.AddCustomTokenFilter(unicodeNormalizeName, filterConfig)
}
// RepoIndexerData is the document stored in the repo indexer for one file.
type RepoIndexerData struct {
// RepoID is the ID of the repository the file belongs to.
RepoID int64
// CommitID is the commit SHA passed to the indexer when the file was indexed.
CommitID string
// Content is the file content converted to UTF-8; addUpdate leaves it empty for files not detected as text.
Content string
// Filename is the file name as reported by the file update.
Filename string
// Language is the value returned by analyze.GetCodeLanguage for the file.
Language string
// UpdatedAt is the UTC time at which the document was built.
UpdatedAt time.Time
}
// Type returns the document type, for bleve's mapping.Classifier interface.
// It always returns repoIndexerDocType, the type under which
// generateBleveIndexMapping registers the document mapping.
func (d *RepoIndexerData) Type() string {
return repoIndexerDocType
}
const (
// repoIndexerAnalyzer names the custom analyzer used as the mapping default and for "Content" queries.
repoIndexerAnalyzer = "repoIndexerAnalyzer"
// filenameIndexerAnalyzer names the custom analyzer applied to the "Filename" field.
filenameIndexerAnalyzer = "filenameIndexerAnalyzer"
// filenameIndexerTokenizer is not referenced by the code visible in this file.
filenameIndexerTokenizer = "filenameIndexerTokenizer"
// repoIndexerDocType is the document type returned by RepoIndexerData.Type.
repoIndexerDocType = "repoIndexerDocType"
// repoIndexerLatestVersion is the index version passed to inner_bleve.NewIndexer.
repoIndexerLatestVersion = 9
)
// generateBleveIndexMapping builds the bleve index mapping for the repo
// indexer: it registers the custom token filter and the two custom analyzers,
// then maps every RepoIndexerData field (none of them is included in "_all",
// which is disabled altogether).
func generateBleveIndexMapping() (mapping.IndexMapping, error) {
	indexMapping := bleve.NewIndexMapping()
	if err := addUnicodeNormalizeTokenFilter(indexMapping); err != nil {
		return nil, err
	}

	// Analyzer used for file content (and as the mapping default).
	contentAnalyzer := map[string]any{
		"type":          analyzer_custom.Name,
		"char_filters":  []string{},
		"tokenizer":     letter.Name,
		"token_filters": []string{unicodeNormalizeName, lowercase.Name},
	}
	if err := indexMapping.AddCustomAnalyzer(repoIndexerAnalyzer, contentAnalyzer); err != nil {
		return nil, err
	}

	// Analyzer used for file names; it runs the path token filter.
	filenameAnalyzer := map[string]any{
		"type":          analyzer_custom.Name,
		"char_filters":  []string{},
		"tokenizer":     unicode.Name,
		"token_filters": []string{unicodeNormalizeName, path_filter.Name, lowercase.Name},
	}
	if err := indexMapping.AddCustomAnalyzer(filenameIndexerAnalyzer, filenameAnalyzer); err != nil {
		return nil, err
	}
	indexMapping.DefaultAnalyzer = repoIndexerAnalyzer

	repoIDField := bleve.NewNumericFieldMapping()
	repoIDField.IncludeInAll = false

	contentField := bleve.NewTextFieldMapping()
	contentField.IncludeInAll = false

	filenameField := bleve.NewTextFieldMapping()
	filenameField.IncludeInAll = false
	filenameField.Analyzer = filenameIndexerAnalyzer

	// Language and CommitID share one keyword-analyzed field mapping.
	keywordField := bleve.NewTextFieldMapping()
	keywordField.IncludeInAll = false
	keywordField.Analyzer = analyzer_keyword.Name

	updatedAtField := bleve.NewDateTimeFieldMapping()
	updatedAtField.IncludeInAll = false

	docMapping := bleve.NewDocumentMapping()
	docMapping.AddFieldMappingsAt("RepoID", repoIDField)
	docMapping.AddFieldMappingsAt("Content", contentField)
	docMapping.AddFieldMappingsAt("Filename", filenameField)
	docMapping.AddFieldMappingsAt("Language", keywordField)
	docMapping.AddFieldMappingsAt("CommitID", keywordField)
	docMapping.AddFieldMappingsAt("UpdatedAt", updatedAtField)

	indexMapping.AddDocumentMapping(repoIndexerDocType, docMapping)
	indexMapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping())
	return indexMapping, nil
}
// Compile-time check that *Indexer satisfies internal.Indexer.
var _ internal.Indexer = &Indexer{}
// Indexer represents a bleve indexer implementation
type Indexer struct {
// inner is the concrete bleve indexer; the methods in this file reach the underlying index through inner.Indexer.
inner *inner_bleve.Indexer
indexer_internal.Indexer // do not composite inner_bleve.Indexer directly to avoid exposing too much
}
// SupportedSearchModes returns the search modes offered by this indexer, as
// provided by indexer.SearchModesExactWords. Search falls back to the first
// entry when the caller does not specify a mode.
func (b *Indexer) SupportedSearchModes() []indexer.SearchMode {
return indexer.SearchModesExactWords()
}
// NewIndexer creates a new bleve local indexer rooted at indexDir. The same
// inner indexer backs both the embedded interface and the private inner field.
func NewIndexer(indexDir string) *Indexer {
	bleveIndexer := inner_bleve.NewIndexer(indexDir, repoIndexerLatestVersion, generateBleveIndexMapping)
	result := &Indexer{inner: bleveIndexer}
	result.Indexer = bleveIndexer
	return result
}
// addUpdate queues one file update on batch.
//
// Vendored files are skipped when setting.Indexer.ExcludeVendored is set.
// Files larger than setting.Indexer.MaxIndexerFileSize are queued for deletion
// instead of being indexed. Otherwise the blob is read through catFileBatch and
// indexed; files not detected as text are indexed with empty content so their
// name stays searchable.
func (b *Indexer) addUpdate(ctx context.Context, catFileBatch git.CatFileBatch, commitSha string,
update internal.FileUpdate, repo *repo_model.Repository, batch *inner_bleve.FlushingBatch,
) error {
// Ignore vendored files in code search
if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) {
return nil
}
size := update.Size
var err error
// When the update does not carry a size, ask git for the blob size.
if !update.Sized {
var stdout string
stdout, _, err = gitrepo.RunCmdString(ctx, repo, gitcmd.NewCommand("cat-file", "-s").AddDynamicArguments(update.BlobSha))
if err != nil {
return err
}
if size, err = strconv.ParseInt(strings.TrimSpace(stdout), 10, 64); err != nil {
return fmt.Errorf("misformatted git cat-file output: %w", err)
}
}
// Oversized files are removed from the index rather than indexed.
if size > setting.Indexer.MaxIndexerFileSize {
return b.addDelete(update.Filename, repo, batch)
}
info, batchReader, err := catFileBatch.QueryContent(update.BlobSha)
if err != nil {
return err
}
// Read exactly info.Size bytes of blob content from the batch reader.
fileContents, err := io.ReadAll(io.LimitReader(batchReader, info.Size))
if err != nil {
return err
} else if !typesniffer.DetectContentType(fileContents).IsText() {
// FIXME: UTF-16 files will probably fail here
// Even if the file is not recognized as a "text file", we could still put its name into the indexers to make the filename become searchable, while leave the content to empty.
fileContents = nil
}
// Skip the single byte that follows the blob content (presumably the
// terminator emitted after each object — confirm against CatFileBatch).
if _, err = batchReader.Discard(1); err != nil {
return err
}
id := internal.FilenameIndexerID(repo.ID, update.Filename)
return batch.Index(id, &RepoIndexerData{
RepoID: repo.ID,
CommitID: commitSha,
Filename: update.Filename,
Content: string(charset.ToUTF8DropErrors(fileContents)),
Language: analyze.GetCodeLanguage(update.Filename, fileContents),
UpdatedAt: time.Now().UTC(),
})
}
// addDelete queues the removal of the document for filename in repo on batch.
func (b *Indexer) addDelete(filename string, repo *repo_model.Repository, batch *inner_bleve.FlushingBatch) error {
	return batch.Delete(internal.FilenameIndexerID(repo.ID, filename))
}
// Index applies the given repository changes to the index: every update is
// (re)indexed at commit sha, every removed filename is deleted, and the
// batch is flushed at the end.
func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *internal.RepoChanges) error {
	flushingBatch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize)

	// A git batch is only opened when there is at least one file to read.
	if len(changes.Updates) != 0 {
		gitBatch, err := gitrepo.NewBatch(ctx, repo)
		if err != nil {
			return err
		}
		defer gitBatch.Close()

		for _, fileUpdate := range changes.Updates {
			err := b.addUpdate(ctx, gitBatch, sha, fileUpdate, repo, flushingBatch)
			if err != nil {
				return err
			}
		}
	}

	for _, removed := range changes.RemovedFilenames {
		err := b.addDelete(removed, repo, flushingBatch)
		if err != nil {
			return err
		}
	}

	return flushingBatch.Flush()
}
// Delete removes every indexed document that belongs to the repository repoID.
//
// It searches for all documents whose "RepoID" field equals repoID and deletes
// the hits in batches of maxBatchSize. The lookup honours ctx, so a cancelled
// or expired context aborts the operation before anything is deleted.
func (b *Indexer) Delete(ctx context.Context, repoID int64) error {
	repoQuery := inner_bleve.NumericEqualityQuery(repoID, "RepoID")
	// 2147483647 (the largest int32) is used as the page size so that a single
	// request returns every matching document.
	searchRequest := bleve.NewSearchRequestOptions(repoQuery, 2147483647, 0, false)
	result, err := b.inner.Indexer.SearchInContext(ctx, searchRequest)
	if err != nil {
		return err
	}

	batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize)
	for _, hit := range result.Hits {
		if err = batch.Delete(hit.ID); err != nil {
			return err
		}
	}
	return batch.Flush()
}
// Search searches for files in the specified repo.
// Returns the matching file-paths
//
// The keyword is matched against the "Content" field (phrase match in exact
// mode, match query otherwise) or, with a boost of 10, as a lower-cased prefix
// of the "Filename" field. Results can be restricted by opts.RepoIDs and
// opts.Language. It returns the total hit count, the requested page of results
// and the language facet computed without the language filter.
func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
var (
indexerQuery query.Query
keywordQuery query.Query
contentQuery query.Query
)
pathQuery := bleve.NewPrefixQuery(strings.ToLower(opts.Keyword))
pathQuery.FieldVal = "Filename"
pathQuery.SetBoost(10)
// Fall back to the first supported mode when the caller did not pick one.
searchMode := util.IfZero(opts.SearchMode, b.SupportedSearchModes()[0].ModeValue)
if searchMode == indexer.SearchModeExact {
// 1.21 used NewPrefixQuery, but it seems not working well, and later releases changed to NewMatchPhraseQuery
q := bleve.NewMatchPhraseQuery(opts.Keyword)
q.Analyzer = repoIndexerAnalyzer
q.FieldVal = "Content"
contentQuery = q
} else /* words */ {
q := bleve.NewMatchQuery(opts.Keyword)
q.FieldVal = "Content"
q.Analyzer = repoIndexerAnalyzer
if searchMode == indexer.SearchModeFuzzy {
// this logic doesn't seem right, it is only used to pass the test-case `Keyword: "dESCRIPTION"`, which doesn't seem to be a real-life use-case.
q.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword)
} else {
q.Operator = query.MatchQueryOperatorAnd
}
contentQuery = q
}
// A document matches when either its content or its filename matches.
keywordQuery = bleve.NewDisjunctionQuery(contentQuery, pathQuery)
if len(opts.RepoIDs) > 0 {
repoQueries := make([]query.Query, 0, len(opts.RepoIDs))
for _, repoID := range opts.RepoIDs {
repoQueries = append(repoQueries, inner_bleve.NumericEqualityQuery(repoID, "RepoID"))
}
indexerQuery = bleve.NewConjunctionQuery(
bleve.NewDisjunctionQuery(repoQueries...),
keywordQuery,
)
} else {
indexerQuery = keywordQuery
}
// Save for reuse without language filter
facetQuery := indexerQuery
if len(opts.Language) > 0 {
languageQuery := bleve.NewMatchQuery(opts.Language)
languageQuery.FieldVal = "Language"
languageQuery.Analyzer = analyzer_keyword.Name
indexerQuery = bleve.NewConjunctionQuery(
indexerQuery,
languageQuery,
)
}
from, pageSize := opts.GetSkipTake()
searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false)
searchRequest.Fields = []string{"Content", "Filename", "RepoID", "Language", "CommitID", "UpdatedAt"}
searchRequest.IncludeLocations = true
// Without a language filter the facet can be computed by the main request.
if len(opts.Language) == 0 {
searchRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
}
searchRequest.SortBy([]string{"-_score", "UpdatedAt"})
result, err := b.inner.Indexer.SearchInContext(ctx, searchRequest)
if err != nil {
return 0, nil, nil, err
}
total := int64(result.Total)
searchResults := make([]*internal.SearchResult, len(result.Hits))
for i, hit := range result.Hits {
// Compute the span covering all content matches: the smallest start and
// the largest end over the first location of every matched term.
startIndex, endIndex := -1, -1
for _, locations := range hit.Locations["Content"] {
location := locations[0]
locationStart := int(location.Start)
locationEnd := int(location.End)
if startIndex < 0 || locationStart < startIndex {
startIndex = locationStart
}
if endIndex < 0 || locationEnd > endIndex {
endIndex = locationEnd
}
}
// A filename match overrides the content span.
if len(hit.Locations["Filename"]) > 0 {
startIndex, endIndex = internal.FilenameMatchIndexPos(hit.Fields["Content"].(string))
}
// NOTE(review): the unchecked type assertions below panic if a stored
// field is missing or has an unexpected type — confirm this cannot happen.
language := hit.Fields["Language"].(string)
var updatedUnix timeutil.TimeStamp
// An unparsable UpdatedAt leaves updatedUnix at its zero value.
if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil {
updatedUnix = timeutil.TimeStamp(t.Unix())
}
searchResults[i] = &internal.SearchResult{
RepoID: int64(hit.Fields["RepoID"].(float64)),
StartIndex: startIndex,
EndIndex: endIndex,
Filename: internal.FilenameOfIndexerID(hit.ID),
Content: hit.Fields["Content"].(string),
CommitID: hit.Fields["CommitID"].(string),
UpdatedUnix: updatedUnix,
Language: language,
Color: enry.GetColor(language),
}
}
searchResultLanguages := make([]*internal.SearchResultLanguages, 0, 10)
if len(opts.Language) > 0 {
// Use separate query to go get all language counts
// NOTE(review): this request uses Search rather than SearchInContext, so
// it does not observe ctx — confirm whether that is intended.
facetRequest := bleve.NewSearchRequestOptions(facetQuery, 1, 0, false)
facetRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
facetRequest.IncludeLocations = true
facetRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
if result, err = b.inner.Indexer.Search(facetRequest); err != nil {
return 0, nil, nil, err
}
}
languagesFacet := result.Facets["languages"]
for _, term := range languagesFacet.Terms.Terms() {
// Skip the bucket of documents without a language.
if len(term.Term) == 0 {
continue
}
searchResultLanguages = append(searchResultLanguages, &internal.SearchResultLanguages{
Language: term.Term,
Color: enry.GetColor(term.Term),
Count: term.Count,
})
}
return total, searchResults, searchResultLanguages, nil
}
@@ -0,0 +1,105 @@
// Copyright 2024 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package path
import (
"slices"
"strings"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const (
// Name is the name under which the path token filter is registered in the bleve registry (see init).
Name = "gitea/path"
)
// TokenFilter is a stateless bleve token filter that turns the tokens of a
// file path into cumulative path terms (see Filter).
type TokenFilter struct{}
// NewTokenFilter returns a new, ready-to-use path TokenFilter.
func NewTokenFilter() *TokenFilter {
	return new(TokenFilter)
}
// TokenFilterConstructor is the constructor registered with the bleve registry
// under Name. The config and cache arguments are ignored; it always returns a
// new TokenFilter and a nil error.
func TokenFilterConstructor(config map[string]any, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewTokenFilter(), nil
}
// Filter returns the forward path terms of input followed by the reversed
// path terms. For a single-token input only the forward terms are returned,
// since the reversed chain would add nothing new.
func (s *TokenFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	forward := generatePathTokens(input, false)
	if len(input) == 1 {
		return forward
	}
	return append(forward, generatePathTokens(input, true)...)
}
// generatePathTokens generates path tokens from the input tokens.
// This mimics the behavior of the path hierarchy tokenizer in ES. It takes the input tokens and combines them, generating a term for each component
// in the tree (e.g., foo/bar/baz.md will generate foo, foo/bar, and foo/bar/baz.md).
//
// If the reversed flag is set, the order of the tokens is reversed (the same input will generate baz.md, baz.md/bar, baz.md/bar/foo). This is useful
// to efficiently search for filenames without supplying the full path.
//
// One term is produced per input token, so an empty input yields an empty
// stream. The input stream itself is never modified.
//
// NOTE(review): earlier revisions stopped one component short (the full path
// was never emitted and the first component was emitted twice). Indexes built
// with that behavior lack the full-path terms until they are rebuilt.
func generatePathTokens(input analysis.TokenStream, reversed bool) analysis.TokenStream {
	if reversed {
		// Reverse a copy so that the caller's token stream keeps its order.
		input = slices.Clone(input)
		slices.Reverse(input)
	}

	terms := make([]string, 0, len(input))
	longestTerm := 0

	// Each term is the previous one extended by "/" and the next component,
	// so the builder is shared across iterations.
	var sb strings.Builder
	for i, tok := range input {
		if i > 0 {
			sb.WriteString("/")
		}
		sb.Write(tok.Term)

		term := sb.String()
		if longestTerm < len(term) {
			longestTerm = len(term)
		}
		terms = append(terms, term)
	}

	output := make(analysis.TokenStream, 0, len(terms))
	for _, term := range terms {
		// NOTE(review): forward terms are anchored at the end of the longest
		// term and reversed terms at offset 0; this is kept as it was —
		// confirm the offsets are what consumers of Start/End expect.
		var start, end int
		if reversed {
			start = 0
			end = len(term)
		} else {
			start = longestTerm - len(term)
			end = longestTerm
		}
		token := analysis.Token{
			Position: 1,
			Start:    start,
			End:      end,
			Type:     analysis.AlphaNumeric,
			Term:     []byte(term),
		}
		output = append(output, &token)
	}
	return output
}
// init registers the path token filter with the bleve registry and panics
// when the registration fails.
func init() {
	// FIXME: move it to the bleve's init function, but do not call it in global init
	if err := registry.RegisterTokenFilter(Name, TokenFilterConstructor); err != nil {
		panic(err)
	}
}
@@ -0,0 +1,76 @@
// Copyright 2024 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package path
import (
"fmt"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
"github.com/stretchr/testify/assert"
)
// Scenario pairs an input string with the tokens expected for it.
// It is not referenced by the tests visible in this file.
type Scenario struct {
Input string
Tokens []string
}
// TestTokenFilter checks, for a set of paths, that the filter yields as many
// terms as expected and that every produced term is one of the expected ones.
func TestTokenFilter(t *testing.T) {
	type testCase struct {
		Input string
		Terms []string
	}
	cases := []testCase{
		{Input: "Dockerfile", Terms: []string{"Dockerfile"}},
		{Input: "Dockerfile.rootless", Terms: []string{"Dockerfile.rootless"}},
		{
			Input: "a/b/c/Dockerfile.rootless",
			Terms: []string{"a", "a/b", "a/b/c", "a/b/c/Dockerfile.rootless", "Dockerfile.rootless", "Dockerfile.rootless/c", "Dockerfile.rootless/c/b", "Dockerfile.rootless/c/b/a"},
		},
		{Input: "", Terms: []string{}},
	}

	for _, tc := range cases {
		name := fmt.Sprintf("ensure terms of '%s'", tc.Input)
		t.Run(name, func(t *testing.T) {
			got := extractTerms(tc.Input)
			assert.Len(t, got, len(tc.Terms))
			for _, gotTerm := range got {
				assert.Contains(t, tc.Terms, gotTerm)
			}
		})
	}
}
func extractTerms(input string) []string {
tokens := tokenize(input)
filteredTokens := filter(tokens)
terms := make([]string, 0, len(filteredTokens))
for _, token := range filteredTokens {
terms = append(terms, string(token.Term))
}
return terms
}
// filter applies a fresh path TokenFilter to input.
func filter(input analysis.TokenStream) analysis.TokenStream {
	return NewTokenFilter().Filter(input)
}
// tokenize splits input with bleve's unicode tokenizer.
func tokenize(input string) analysis.TokenStream {
	return unicode.NewUnicodeTokenizer().Tokenize([]byte(input))
}
@@ -0,0 +1,405 @@
// Copyright 2020 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package elasticsearch
import (
"context"
"fmt"
"io"
"strconv"
"strings"
repo_model "gitea.dev/models/repo"
"gitea.dev/modules/analyze"
"gitea.dev/modules/charset"
"gitea.dev/modules/git"
"gitea.dev/modules/git/gitcmd"
"gitea.dev/modules/gitrepo"
"gitea.dev/modules/indexer"
"gitea.dev/modules/indexer/code/internal"
es "gitea.dev/modules/indexer/internal/elasticsearch"
"gitea.dev/modules/json"
"gitea.dev/modules/log"
"gitea.dev/modules/setting"
"gitea.dev/modules/timeutil"
"gitea.dev/modules/typesniffer"
"gitea.dev/modules/util"
"github.com/go-enry/go-enry/v2"
)
// esRepoIndexerLatestVersion is the index version passed to es.NewIndexer.
const esRepoIndexerLatestVersion = 3
// Compile-time check that *Indexer satisfies internal.Indexer.
var _ internal.Indexer = &Indexer{}
// Indexer implements the code indexer interface on top of the embedded
// elasticsearch indexer, whose Bulk, Refresh, DeleteByQuery, Search and
// VersionedIndexName methods are used by the methods in this file.
type Indexer struct {
*es.Indexer
}
// SupportedSearchModes returns the search modes offered by this indexer, as
// provided by indexer.SearchModesExactWords. Search falls back to the first
// entry when the caller does not specify a mode.
func (b *Indexer) SupportedSearchModes() []indexer.SearchMode {
return indexer.SearchModesExactWords()
}
// NewIndexer creates a new elasticsearch indexer for the given server url and
// index name, using esRepoIndexerLatestVersion and defaultMapping.
func NewIndexer(url, indexerName string) *Indexer {
	esIndexer := es.NewIndexer(url, indexerName, esRepoIndexerLatestVersion, defaultMapping)
	return &Indexer{Indexer: esIndexer}
}
const (
// defaultMapping is the index definition (analysis settings and field
// mappings) passed to es.NewIndexer. It declares a lower-casing content
// analyzer that splits on non-alphanumeric characters, forward and reversed
// path-hierarchy analyzers for file names, and the document fields repo_id,
// filename, content, commit_id, language and updated_at.
//
// NOTE(review): the "filename.path" sub-field uses the reversed analyzer while
// "filename.path_reversed" uses the forward one — confirm this pairing is
// intentional before relying on the sub-field names.
defaultMapping = `{
"settings": {
"analysis": {
"analyzer": {
"content_analyzer": {
"tokenizer": "content_tokenizer",
"filter" : ["lowercase"]
},
"filename_path_analyzer": {
"tokenizer": "path_tokenizer"
},
"reversed_filename_path_analyzer": {
"tokenizer": "reversed_path_tokenizer"
}
},
"tokenizer": {
"content_tokenizer": {
"type": "simple_pattern_split",
"pattern": "[^a-zA-Z0-9]"
},
"path_tokenizer": {
"type": "path_hierarchy",
"delimiter": "/"
},
"reversed_path_tokenizer": {
"type": "path_hierarchy",
"delimiter": "/",
"reverse": true
}
}
}
},
"mappings": {
"properties": {
"repo_id": {
"type": "long",
"index": true
},
"filename": {
"type": "text",
"term_vector": "with_positions_offsets",
"index": true,
"fields": {
"path": {
"type": "text",
"analyzer": "reversed_filename_path_analyzer"
},
"path_reversed": {
"type": "text",
"analyzer": "filename_path_analyzer"
}
}
},
"content": {
"type": "text",
"term_vector": "with_positions_offsets",
"index": true,
"analyzer": "content_analyzer"
},
"commit_id": {
"type": "keyword",
"index": true
},
"language": {
"type": "keyword",
"index": true
},
"updated_at": {
"type": "long",
"index": true
}
}
}
}`
)
// addUpdate builds the bulk operations needed to bring one updated file into
// the index.
//
// It returns no operation for vendored files (when
// setting.Indexer.ExcludeVendored is set) and for files not detected as text,
// a delete operation for files larger than setting.Indexer.MaxIndexerFileSize,
// and an index operation otherwise. The blob content is read through
// catFileBatch, which is always left positioned after the queried object so
// that it can be reused for the next update.
func (b *Indexer) addUpdate(ctx context.Context, catFileBatch git.CatFileBatch, sha string, update internal.FileUpdate, repo *repo_model.Repository) ([]es.BulkOp, error) {
	// Ignore vendored files in code search
	if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) {
		return nil, nil
	}

	size := update.Size
	var err error
	// When the update does not carry a size, ask git for the blob size.
	if !update.Sized {
		var stdout string
		stdout, _, err = gitrepo.RunCmdString(ctx, repo, gitcmd.NewCommand("cat-file", "-s").AddDynamicArguments(update.BlobSha))
		if err != nil {
			return nil, err
		}
		if size, err = strconv.ParseInt(strings.TrimSpace(stdout), 10, 64); err != nil {
			return nil, fmt.Errorf("misformatted git cat-file output: %w", err)
		}
	}

	id := internal.FilenameIndexerID(repo.ID, update.Filename)

	// Oversized files are removed from the index rather than indexed.
	if size > setting.Indexer.MaxIndexerFileSize {
		return []es.BulkOp{es.DeleteOp(id)}, nil
	}

	info, batchReader, err := catFileBatch.QueryContent(update.BlobSha)
	if err != nil {
		return nil, err
	}

	fileContents, err := io.ReadAll(io.LimitReader(batchReader, info.Size))
	if err != nil {
		return nil, err
	}

	// Consume the single byte that follows the blob content (presumably the
	// LF that `git cat-file --batch` emits after each object) BEFORE any early
	// return, otherwise the shared batch reader is left one byte out of
	// position for the next query.
	if _, err = batchReader.Discard(1); err != nil {
		return nil, err
	}

	if !typesniffer.DetectContentType(fileContents).IsText() {
		// FIXME: UTF-16 files will probably fail here
		// NOTE(review): a document previously indexed for this file is left
		// untouched here — confirm whether it should be deleted instead.
		return nil, nil
	}

	return []es.BulkOp{es.IndexOp(id, map[string]any{
		"repo_id":    repo.ID,
		"filename":   update.Filename,
		"content":    string(charset.ToUTF8DropErrors(fileContents)),
		"commit_id":  sha,
		"language":   analyze.GetCodeLanguage(update.Filename, fileContents),
		"updated_at": timeutil.TimeStampNow(),
	})}, nil
}
// addDelete returns the bulk operation that removes the document of filename
// in repo from the index.
func (b *Indexer) addDelete(filename string, repo *repo_model.Repository) es.BulkOp {
	docID := internal.FilenameIndexerID(repo.ID, filename)
	return es.DeleteOp(docID)
}
// Index collects the bulk operations for all updated and removed files of the
// given changes and sends them to elasticsearch in chunks of at most 50.
func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *internal.RepoChanges) error {
	ops := make([]es.BulkOp, 0)

	// A git batch is only opened when there is at least one file to read.
	if len(changes.Updates) != 0 {
		gitBatch, err := gitrepo.NewBatch(ctx, repo)
		if err != nil {
			return err
		}
		defer gitBatch.Close()

		for _, fileUpdate := range changes.Updates {
			fileOps, err := b.addUpdate(ctx, gitBatch, sha, fileUpdate, repo)
			if err != nil {
				return err
			}
			ops = append(ops, fileOps...)
		}
	}

	for _, removed := range changes.RemovedFilenames {
		ops = append(ops, b.addDelete(removed, repo))
	}

	const esBatchSize = 50
	for len(ops) > 0 {
		chunkLen := min(esBatchSize, len(ops))
		if err := b.Bulk(ctx, ops[:chunkLen]); err != nil {
			return err
		}
		ops = ops[chunkLen:]
	}
	return nil
}
// Delete removes all index entries of the repository repoID. When the first
// attempt fails, the index is refreshed and the deletion is retried once.
func (b *Indexer) Delete(ctx context.Context, repoID int64) error {
	firstErr := b.doDelete(ctx, repoID)
	if firstErr == nil {
		return nil
	}

	// Maybe there is a conflict during the delete operation, so we should retry after a refresh
	log.Warn("Deletion of entries of repo %v within index %v was erroneous: %v. Trying to refresh index before trying again", repoID, b.VersionedIndexName(), firstErr)
	if refreshErr := b.Refresh(ctx); refreshErr != nil {
		return refreshErr
	}

	retryErr := b.doDelete(ctx, repoID)
	if retryErr != nil {
		log.Error("Could not delete entries of repo %v within index %v", repoID, b.VersionedIndexName())
		return retryErr
	}
	return nil
}
// doDelete issues a single delete-by-query for all documents whose "repo_id"
// equals repoID. It performs no retry; Delete wraps it with one.
func (b *Indexer) doDelete(ctx context.Context, repoID int64) error {
return b.DeleteByQuery(ctx, es.TermsQuery("repo_id", repoID))
}
// contentMatchIndexPos locates the first occurrence of start in content and
// the first occurrence of end after it. It returns the offset where start
// begins and the offset just past end, reduced by 9 (the combined length of
// "<em>" and "</em>") so the end refers to the original, un-highlighted data.
// If either marker is missing it returns -1, -1.
func contentMatchIndexPos(content, start, end string) (int, int) {
	// remove the length <em></em> since we give Content the original data
	const emTagsLen = 9

	startIdx := strings.Index(content, start)
	if startIdx >= 0 {
		afterStart := startIdx + len(start)
		if rel := strings.Index(content[afterStart:], end); rel >= 0 {
			return startIdx, afterStart + rel + len(end) - emTagsLen
		}
	}
	return -1, -1
}
// convertResult converts an elasticsearch response into the indexer's result
// types. It returns the total hit count, one SearchResult per hit and the
// language aggregation extracted from the same response.
//
// kw is the highlighted keyword and is only used in error messages; pageSize
// sizes the result slice. An error is returned (instead of panicking) when a
// hit cannot be decoded, lacks one of the expected fields, or carries no
// usable highlight to derive the match position from.
func convertResult(searchResult *es.SearchResponse, kw string, pageSize int) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
	hits := make([]*internal.SearchResult, 0, pageSize)
	for _, hit := range searchResult.Hits {
		repoID, fileName := internal.ParseIndexerID(hit.ID)
		res := make(map[string]any)
		if err := json.Unmarshal(hit.Source, &res); err != nil {
			return 0, nil, nil, err
		}

		// Validate the document fields up front so that a malformed document
		// yields an error rather than a type-assertion panic.
		content, ok := res["content"].(string)
		if !ok {
			return 0, nil, nil, fmt.Errorf("search hit %v: field %q is missing or not a string", hit.ID, "content")
		}
		commitID, ok := res["commit_id"].(string)
		if !ok {
			return 0, nil, nil, fmt.Errorf("search hit %v: field %q is missing or not a string", hit.ID, "commit_id")
		}
		language, ok := res["language"].(string)
		if !ok {
			return 0, nil, nil, fmt.Errorf("search hit %v: field %q is missing or not a string", hit.ID, "language")
		}
		updatedAt, ok := res["updated_at"].(float64)
		if !ok {
			return 0, nil, nil, fmt.Errorf("search hit %v: field %q is missing or not a number", hit.ID, "updated_at")
		}

		// FIXME: There is no way to get the position the keyword on the content currently on the same request.
		// So we get it from content, this may made the query slower. See
		// https://discuss.elastic.co/t/fetching-position-of-keyword-in-matched-document/94291
		var startIndex, endIndex int
		switch {
		case len(hit.Highlight["filename"]) > 0:
			startIndex, endIndex = internal.FilenameMatchIndexPos(content)
		case len(hit.Highlight["content"]) > 0:
			// FIXME: Since the highlighting content will include <em> and </em> for the keywords,
			// now we should find the positions. But how to avoid html content which contains the
			// <em> and </em> tags? If elastic search has handled that?
			startIndex, endIndex = contentMatchIndexPos(hit.Highlight["content"][0], "<em>", "</em>")
			if startIndex == -1 {
				return 0, nil, nil, fmt.Errorf("search hit %v: no highlight markers found in content for keyword %q", hit.ID, kw)
			}
		default:
			return 0, nil, nil, fmt.Errorf("search hit %v: no filename or content highlight returned for keyword %q", hit.ID, kw)
		}

		hits = append(hits, &internal.SearchResult{
			RepoID:      repoID,
			Filename:    fileName,
			CommitID:    commitID,
			Content:     content,
			UpdatedUnix: timeutil.TimeStamp(updatedAt),
			Language:    language,
			StartIndex:  startIndex,
			EndIndex:    endIndex,
			Color:       enry.GetColor(language),
		})
	}
	return searchResult.Total, hits, extractAggs(searchResult), nil
}
// extractAggs converts the "language" aggregation of searchResult into
// per-language counts. It returns nil when the response has no such
// aggregation.
func extractAggs(searchResult *es.SearchResponse) []*internal.SearchResultLanguages {
	buckets, found := searchResult.Aggregations["language"]
	if !found {
		return nil
	}

	languages := make([]*internal.SearchResultLanguages, 0, 10)
	for _, bucket := range buckets {
		// language is mapped as keyword so the key is always a string; if the
		// mapping ever changes, skip rather than emit an empty-language bucket.
		if name, isString := bucket.Key.(string); isString {
			languages = append(languages, &internal.SearchResultLanguages{
				Language: name,
				Color:    enry.GetColor(name),
				Count:    int(bucket.DocCount),
			})
		}
	}
	return languages
}
// Search searches for codes and language stats by given conditions.
//
// The keyword is matched against "content" (phrase match in exact mode,
// otherwise all words must match) or as a phrase prefix against "filename"
// with a boost of 10. Without a language filter a single request returns both
// hits and the language aggregation; with a language filter the aggregation is
// fetched first without the filter, then the hits with it.
func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
// Fall back to the first supported mode when the caller did not pick one.
searchMode := util.IfZero(opts.SearchMode, b.SupportedSearchModes()[0].ModeValue)
contentQuery := es.Query(es.NewMultiMatchQuery(opts.Keyword, "content").Type(es.MultiMatchTypeBestFields).Operator("and"))
if searchMode == indexer.SearchModeExact {
contentQuery = es.MatchPhraseQuery("content", opts.Keyword)
}
kwQuery := es.NewBoolQuery().Should(
contentQuery,
es.NewMultiMatchQuery(opts.Keyword, "filename^10").Type(es.MultiMatchTypePhrasePrefix),
)
query := es.NewBoolQuery().Must(kwQuery)
if len(opts.RepoIDs) > 0 {
query.Must(es.TermsQuery("repo_id", es.ToAnySlice(opts.RepoIDs)...))
}
start, pageSize := opts.GetSkipTake()
// kw is only passed on to convertResult.
kw := "<em>" + opts.Keyword + "</em>"
// Top 10 languages by document count.
languageAggs := map[string]any{
"language": map[string]any{
"terms": map[string]any{
"field": "language",
"size": 10,
"order": map[string]any{"_count": "desc"},
},
},
}
// number_of_fragments=0 returns the full highlighted content (no fragmentation).
highlight := map[string]any{
"fields": map[string]any{
"content": map[string]any{},
"filename": map[string]any{},
},
"number_of_fragments": 0,
"type": "fvh",
}
// Best score first; ties ordered by ascending updated_at.
sort := []es.SortField{
{Field: "_score", Desc: true},
{Field: "updated_at", Desc: false},
}
// No language filter: hits and aggregation come from the same request.
if len(opts.Language) == 0 {
resp, err := b.Indexer.Search(ctx, es.SearchRequest{
Query: query,
Sort: sort,
From: start,
Size: pageSize,
TrackTotal: true,
Aggregations: languageAggs,
Highlight: highlight,
})
if err != nil {
return 0, nil, nil, err
}
return convertResult(resp, kw, pageSize)
}
// Language filter present: compute the language stats before the filter is
// added to the query.
countResp, err := b.Indexer.Search(ctx, es.SearchRequest{
Query: query,
Size: 0, // stats only
TrackTotal: true,
Aggregations: languageAggs,
})
if err != nil {
return 0, nil, nil, err
}
query.Must(es.MatchQuery("language", opts.Language))
resp, err := b.Indexer.Search(ctx, es.SearchRequest{
Query: query,
Sort: sort,
From: start,
Size: pageSize,
TrackTotal: true,
Highlight: highlight,
})
if err != nil {
return 0, nil, nil, err
}
// Hits come from the filtered response, language stats from the unfiltered one.
total, hits, _, err := convertResult(resp, kw, pageSize)
return total, hits, extractAggs(countResp), err
}
@@ -0,0 +1,16 @@
// Copyright 2020 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package elasticsearch
import (
"testing"
"github.com/stretchr/testify/assert"
)
// TestIndexPos pins the offsets returned by contentMatchIndexPos for a simple
// start/end marker pair.
func TestIndexPos(t *testing.T) {
	gotStart, gotEnd := contentMatchIndexPos("test index start and end", "start", "end")
	assert.Equal(t, 11, gotStart)
	assert.Equal(t, 15, gotEnd)
}
+201
View File
@@ -0,0 +1,201 @@
// Copyright 2019 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package code
import (
"context"
"strconv"
"strings"
repo_model "gitea.dev/models/repo"
"gitea.dev/modules/git"
"gitea.dev/modules/git/gitcmd"
"gitea.dev/modules/gitrepo"
"gitea.dev/modules/indexer/code/internal"
"gitea.dev/modules/log"
"gitea.dev/modules/setting"
)
// getDefaultBranchSha returns the trimmed output of `git show-ref -s` for the
// repository's default branch.
func getDefaultBranchSha(ctx context.Context, repo *repo_model.Repository) (string, error) {
	showRefCmd := gitcmd.NewCommand("show-ref", "-s").AddDynamicArguments(git.BranchPrefix + repo.DefaultBranch)
	stdout, _, err := gitrepo.RunCmdString(ctx, repo, showRefCmd)
	if err != nil {
		return "", err
	}
	return strings.TrimSpace(stdout), nil
}
// getRepoChanges returns changes to repo since last indexer update.
// A full (genesis) change set is produced when nothing has been indexed yet
// or when `git merge-base` prints nothing for the indexed commit and revision;
// otherwise only the differences are returned.
func getRepoChanges(ctx context.Context, repo *repo_model.Repository, revision string) (*internal.RepoChanges, error) {
	status, err := repo_model.GetIndexerStatus(ctx, repo, repo_model.RepoIndexerTypeCode)
	if err != nil {
		return nil, err
	}

	// Nothing indexed so far: index everything.
	if len(status.CommitSha) == 0 {
		return genesisChanges(ctx, repo, revision)
	}

	hasAncestorCmd := gitcmd.NewCommand("merge-base").AddDynamicArguments(status.CommitSha, revision)
	stdout, _, _ := gitrepo.RunCmdString(ctx, repo, hasAncestorCmd) // FIXME: error is not handled
	if len(stdout) == 0 {
		return genesisChanges(ctx, repo, revision)
	}
	return nonGenesisChanges(ctx, repo, revision)
}
// isIndexable reports whether a tree entry should be indexed: it must be a
// regular or executable file, its lower-cased name must not match any exclude
// pattern, and it must match an include pattern when any are configured.
func isIndexable(entry *git.TreeEntry) bool {
	if !(entry.IsRegular() || entry.IsExecutable()) {
		return false
	}

	lowerName := strings.ToLower(entry.Name())
	for _, exclude := range setting.Indexer.ExcludePatterns {
		if exclude.Match(lowerName) {
			return false
		}
	}

	// Without include patterns everything that is not excluded is indexable.
	if len(setting.Indexer.IncludePatterns) == 0 {
		return true
	}
	for _, include := range setting.Indexer.IncludePatterns {
		if include.Match(lowerName) {
			return true
		}
	}
	return false
}
// parseGitLsTreeOutput parses the output of a `git ls-tree --full-tree -l`
// command and returns one sized FileUpdate per indexable entry.
func parseGitLsTreeOutput(stdout []byte) ([]internal.FileUpdate, error) {
	entries, err := git.ParseTreeEntries(stdout)
	if err != nil {
		return nil, err
	}

	updates := make([]internal.FileUpdate, 0, len(entries))
	for _, entry := range entries {
		if !isIndexable(entry) {
			continue
		}
		updates = append(updates, internal.FileUpdate{
			Filename: entry.Name(),
			BlobSha:  entry.ID.String(),
			Size:     entry.Size(),
			Sized:    true,
		})
	}
	return updates, nil
}
// genesisChanges returns the changes needed to add the repo to the indexer
// for the first time: every indexable file of revision is reported as an update.
func genesisChanges(ctx context.Context, repo *repo_model.Repository, revision string) (*internal.RepoChanges, error) {
	lsTreeCmd := gitcmd.NewCommand("ls-tree", "--full-tree", "-l", "-r").AddDynamicArguments(revision)
	stdout, _, runErr := gitrepo.RunCmdBytes(ctx, repo, lsTreeCmd)
	if runErr != nil {
		return nil, runErr
	}

	updates, err := parseGitLsTreeOutput(stdout)
	return &internal.RepoChanges{Updates: updates}, err
}
// nonGenesisChanges get changes since the previous indexer update
//
// It parses `git diff --name-status` between the last indexed commit
// (repo.CodeIndexerStatus.CommitSha) and revision. Modified, added, renamed
// and copied paths are resolved to file updates through `git ls-tree`, in
// groups of at most 30 paths; deleted paths and the sources of renames are
// reported as removed. If the diff itself fails, the repository's index
// entries are deleted and a genesis change set is returned instead.
func nonGenesisChanges(ctx context.Context, repo *repo_model.Repository, revision string) (*internal.RepoChanges, error) {
diffCmd := gitcmd.NewCommand("diff", "--name-status").AddDynamicArguments(repo.CodeIndexerStatus.CommitSha, revision)
stdout, _, runErr := gitrepo.RunCmdString(ctx, repo, diffCmd)
if runErr != nil {
// previous commit sha may have been removed by a force push, so
// try rebuilding from scratch
log.Warn("git diff: %v", runErr)
if err := (*globalIndexer.Load()).Delete(ctx, repo.ID); err != nil {
return nil, err
}
return genesisChanges(ctx, repo, revision)
}
var changes internal.RepoChanges
var err error
updatedFilenames := make([]string, 0, 10)
// updateChanges resolves the currently collected updatedFilenames with
// `git ls-tree` and appends the resulting file updates to changes.Updates.
updateChanges := func() error {
cmd := gitcmd.NewCommand("ls-tree", "--full-tree", "-l").AddDynamicArguments(revision).
AddDashesAndList(updatedFilenames...)
lsTreeStdout, _, err := gitrepo.RunCmdBytes(ctx, repo, cmd)
if err != nil {
return err
}
updates, err1 := parseGitLsTreeOutput(lsTreeStdout)
if err1 != nil {
return err1
}
changes.Updates = append(changes.Updates, updates...)
return nil
}
lines := strings.SplitSeq(stdout, "\n")
for line := range lines {
line = strings.TrimSpace(line)
if len(line) == 0 {
continue
}
// Each line is "<status>\t<path>" or, for renames/copies, "<status>\t<src>\t<dst>".
fields := strings.Split(line, "\t")
if len(fields) < 2 {
log.Warn("Unparseable output for diff --name-status: `%s`)", line)
continue
}
filename := fields[1]
if len(filename) == 0 {
continue
} else if filename[0] == '"' {
// Paths that start with a double quote are unquoted with strconv.Unquote.
filename, err = strconv.Unquote(filename)
if err != nil {
return nil, err
}
}
// Only the first byte of the status field is inspected.
switch status := fields[0][0]; status {
case 'M', 'A':
updatedFilenames = append(updatedFilenames, filename)
case 'D':
changes.RemovedFilenames = append(changes.RemovedFilenames, filename)
case 'R', 'C':
if len(fields) < 3 {
log.Warn("Unparseable output for diff --name-status: `%s`)", line)
continue
}
dest := fields[2]
if len(dest) == 0 {
log.Warn("Unparseable output for diff --name-status: `%s`)", line)
continue
}
if dest[0] == '"' {
dest, err = strconv.Unquote(dest)
if err != nil {
return nil, err
}
}
// A rename removes the source path; a copy keeps it.
if status == 'R' {
changes.RemovedFilenames = append(changes.RemovedFilenames, filename)
}
updatedFilenames = append(updatedFilenames, dest)
default:
log.Warn("Unrecognized status: %c (line=%s)", status, line)
}
// According to https://learn.microsoft.com/en-us/troubleshoot/windows-client/shell-experience/command-line-string-limitation#more-information
// the command line length should less than 8191 characters, assume filepath is 256, then 8191/256 = 31, so we use 30
if len(updatedFilenames) >= 30 {
if err := updateChanges(); err != nil {
return nil, err
}
// Reset the length but keep the backing array for the next group.
updatedFilenames = updatedFilenames[0:0]
}
}
// Resolve whatever is left over from the last, incomplete group.
if len(updatedFilenames) > 0 {
if err := updateChanges(); err != nil {
return nil, err
}
}
return &changes, err
}
+66
View File
@@ -0,0 +1,66 @@
// Copyright 2025 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package gitgrep
import (
"context"
"fmt"
"strings"
"gitea.dev/modules/git"
"gitea.dev/modules/indexer"
code_indexer "gitea.dev/modules/indexer/code"
"gitea.dev/modules/setting"
)
// indexSettingToGitGrepPathspecList translates the indexer's include and
// exclude glob settings into git pathspecs: every include pattern is emitted
// as ":(glob)<pattern>", followed by every exclude pattern as
// ":(glob,exclude)<pattern>". The result is nil when no pattern is configured.
func indexSettingToGitGrepPathspecList() (list []string) {
	const (
		includeMagic = ":(glob)"
		excludeMagic = ":(glob,exclude)"
	)
	for _, included := range setting.Indexer.IncludePatterns {
		pathspec := includeMagic + included.PatternString()
		list = append(list, pathspec)
	}
	for _, excluded := range setting.Indexer.ExcludePatterns {
		pathspec := excludeMagic + excluded.PatternString()
		list = append(list, pathspec)
	}
	return list
}
// PerformSearch greps ref of gitRepo for keyword and returns one page of
// highlighted results.
//
// searchMode selects the grep mode: exact and regexp map to their git
// counterparts, everything else falls back to word matching. The indexer's
// include/exclude patterns are passed to git as pathspecs. page is 1-based and
// the page size is setting.UI.RepoSearchPagingNum; a page below 1 is treated as
// the first page. total is the number of matches before paging.
func PerformSearch(ctx context.Context, page int, repoID int64, gitRepo *git.Repository, ref git.RefName, keyword string, searchMode indexer.SearchModeType) (searchResults []*code_indexer.Result, total int64, err error) {
	grepMode := git.GrepModeWords
	switch searchMode {
	case indexer.SearchModeExact:
		grepMode = git.GrepModeExact
	case indexer.SearchModeRegexp:
		grepMode = git.GrepModeRegexp
	}
	res, err := git.GrepSearch(ctx, gitRepo, keyword, git.GrepOptions{
		ContextLineNumber: 1,
		GrepMode:          grepMode,
		RefName:           ref.String(),
		PathspecList:      indexSettingToGitGrepPathspecList(),
	})
	if err != nil {
		// TODO: if no branch exists, it reports: exit status 128, fatal: this operation must be run in a work tree.
		return nil, 0, fmt.Errorf("git.GrepSearch: %w", err)
	}
	commitID, err := gitRepo.GetRefCommitID(ref.String())
	if err != nil {
		return nil, 0, fmt.Errorf("gitRepo.GetRefCommitID: %w", err)
	}

	total = int64(len(res))

	// A page below 1 used to produce a negative lower bound and panic in the
	// slice expression. Normalise the page and clamp both bounds into
	// [0, len(res)] with pageStart <= pageEnd so the slice is always valid.
	page = max(page, 1)
	pageSize := setting.UI.RepoSearchPagingNum
	pageStart := min(max((page-1)*pageSize, 0), len(res))
	pageEnd := min(max(page*pageSize, pageStart), len(res))
	res = res[pageStart:pageEnd]

	for _, r := range res {
		searchResults = append(searchResults, &code_indexer.Result{
			RepoID:   repoID,
			Filename: r.Filename,
			CommitID: commitID,
			// UpdatedUnix: not supported yet
			// Language:    not supported yet
			// Color:       not supported yet
			Lines: code_indexer.HighlightSearchResultCode(r.Filename, "", r.LineNumbers, strings.Join(r.LineCodes, "\n")),
		})
	}
	return searchResults, total, nil
}
@@ -0,0 +1,19 @@
// Copyright 2024 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package gitgrep
import (
"testing"
"gitea.dev/modules/setting"
"gitea.dev/modules/test"
"github.com/stretchr/testify/assert"
)
// TestIndexSettingToGitGrepPathspecList checks that one include and one
// exclude pattern are rendered as glob pathspecs, includes first.
func TestIndexSettingToGitGrepPathspecList(t *testing.T) {
	restoreIncludes := test.MockVariableValue(&setting.Indexer.IncludePatterns, setting.IndexerGlobFromString("a"))
	defer restoreIncludes()
	restoreExcludes := test.MockVariableValue(&setting.Indexer.ExcludePatterns, setting.IndexerGlobFromString("b"))
	defer restoreExcludes()

	want := []string{":(glob)a", ":(glob,exclude)b"}
	got := indexSettingToGitGrepPathspecList()
	assert.Equal(t, want, got)
}
+314
View File
@@ -0,0 +1,314 @@
// Copyright 2016 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package code
import (
"context"
"os"
"runtime/pprof"
"slices"
"sync/atomic"
"time"
"gitea.dev/models/db"
repo_model "gitea.dev/models/repo"
"gitea.dev/modules/graceful"
"gitea.dev/modules/indexer"
"gitea.dev/modules/indexer/code/bleve"
"gitea.dev/modules/indexer/code/elasticsearch"
"gitea.dev/modules/indexer/code/internal"
"gitea.dev/modules/log"
"gitea.dev/modules/process"
"gitea.dev/modules/queue"
"gitea.dev/modules/setting"
"gitea.dev/modules/util"
)
var (
// indexerQueue is the queue of repositories waiting to be (re)indexed.
// It is assigned by Init.
indexerQueue *queue.WorkerPoolQueue[*internal.IndexerData]
// globalIndexer is the global indexer, it cannot be nil.
// When the real indexer is not ready, it will be a dummy indexer which will return an error to explain it's not ready.
// So it's always safe to use it as *globalIndexer.Load() and call its methods.
globalIndexer atomic.Pointer[internal.Indexer]
)
// init seeds globalIndexer with a dummy indexer so that loading and
// dereferencing it is valid before Init has installed a real one.
func init() {
	var placeholder internal.Indexer = internal.NewDummyIndexer()
	globalIndexer.Store(&placeholder)
}
// index brings the code index of the repository repoID up to date.
//
// If the repository no longer exists its indexed data is deleted. Repositories
// whose kind (fork, mirror, template, or plain source) is not listed in
// setting.Indexer.RepoIndexerRepoTypes are skipped; an empty setting means
// only "sources". Otherwise the changes up to the default branch SHA are
// handed to indexer and the stored indexer status is updated to that SHA.
func index(ctx context.Context, indexer internal.Indexer, repoID int64) error {
	repo, err := repo_model.GetRepositoryByID(ctx, repoID)
	switch {
	case repo_model.IsErrRepoNotExist(err):
		return indexer.Delete(ctx, repoID)
	case err != nil:
		return err
	}

	enabledTypes := setting.Indexer.RepoIndexerRepoTypes
	if len(enabledTypes) == 0 {
		enabledTypes = []string{"sources"}
	}
	enabled := func(repoType string) bool {
		return slices.Contains(enabledTypes, repoType)
	}

	// A repository that is neither a fork, a mirror nor a template counts as a regular source repo.
	isSource := !repo.IsFork && !repo.IsMirror && !repo.IsTemplate
	switch {
	case repo.IsFork && !enabled("forks"):
		return nil
	case repo.IsMirror && !enabled("mirrors"):
		return nil
	case repo.IsTemplate && !enabled("templates"):
		return nil
	case isSource && !enabled("sources"):
		return nil
	}

	sha, err := getDefaultBranchSha(ctx, repo)
	if err != nil {
		return err
	}

	changes, err := getRepoChanges(ctx, repo, sha)
	if err != nil {
		return err
	}
	if changes == nil {
		// nothing to index
		return nil
	}

	if err = indexer.Index(ctx, repo, sha, changes); err != nil {
		return err
	}
	return repo_model.UpdateIndexerStatus(ctx, repo, repo_model.RepoIndexerTypeCode, sha)
}
// Init initializes the repo indexer.
//
// When the repo indexer is disabled in settings, the currently stored indexer
// is closed and nothing else happens. Otherwise Init registers a termination
// hook, creates the unique "code_indexer" queue, and starts a goroutine that
// builds the configured indexer (bleve or elasticsearch), stores it in
// globalIndexer and starts queue processing. If the index did not exist
// before, a population run over the existing repositories is started too.
// When setting.Indexer.StartupTimeout is positive, a second goroutine waits
// for initialization to finish, for shutdown, or for the timeout.
func Init() {
if !setting.Indexer.RepoIndexerEnabled {
(*globalIndexer.Load()).Close()
return
}
ctx, cancel, finished := process.GetManager().AddTypedContext(context.Background(), "Service: CodeIndexer", process.SystemProcessType, false)
graceful.GetManager().RunAtTerminate(func() {
// If ctx is already cancelled, another path has handled the teardown.
select {
case <-ctx.Done():
return
default:
}
cancel()
log.Debug("Closing repository indexer")
(*globalIndexer.Load()).Close()
log.Info("PID: %d Repository Indexer closed", os.Getpid())
finished()
})
// waitChannel reports the initialization duration on success; it is closed
// without a value when initialization fails. The buffer of one lets the
// init goroutine send without a receiver being ready.
waitChannel := make(chan time.Duration, 1)
// Create the Queue
switch setting.Indexer.RepoType {
case "bleve", "elasticsearch":
handler := func(items ...*internal.IndexerData) (unhandled []*internal.IndexerData) {
// Load the indexer once per batch so every item in the batch uses the same one.
indexer := *globalIndexer.Load()
for _, indexerData := range items {
log.Trace("IndexerData Process Repo: %d", indexerData.RepoID)
if err := index(ctx, indexer, indexerData.RepoID); err != nil {
// Index errors are logged, except while running tests.
if !setting.IsInTesting {
log.Error("Codes indexer handler: index error for repo %v: %v", indexerData.RepoID, err)
}
}
}
return nil // do not re-queue the failed items, otherwise some broken repo will block the queue
}
indexerQueue = queue.CreateUniqueQueue(ctx, "code_indexer", handler)
if indexerQueue == nil {
log.Fatal("Unable to create codes indexer queue")
}
default:
log.Fatal("Unknown codes indexer type; %s", setting.Indexer.RepoType)
}
// Build the real indexer in the background.
go func() {
pprof.SetGoroutineLabels(ctx)
start := time.Now()
var (
rIndexer internal.Indexer
existed bool
err error
)
switch setting.Indexer.RepoType {
case "bleve":
log.Info("PID: %d Initializing Repository Indexer at: %s", os.Getpid(), setting.Indexer.RepoPath)
// A panic during initialization is logged together with a hint on how to recreate the index.
defer func() {
if err := recover(); err != nil {
log.Error("PANIC whilst initializing repository indexer: %v\nStacktrace: %s", err, log.Stack(2))
log.Error("The indexer files are likely corrupted and may need to be deleted")
log.Error("You can completely remove the \"%s\" directory to make Gitea recreate the indexes", setting.Indexer.RepoPath)
}
}()
rIndexer = bleve.NewIndexer(setting.Indexer.RepoPath)
existed, err = rIndexer.Init(ctx)
if err != nil {
// Tear down and close waitChannel (signalling failure to the watcher) before the fatal log.
cancel()
(*globalIndexer.Load()).Close()
close(waitChannel)
log.Fatal("PID: %d Unable to initialize the bleve Repository Indexer at path: %s Error: %v", os.Getpid(), setting.Indexer.RepoPath, err)
}
case "elasticsearch":
log.Info("PID: %d Initializing Repository Indexer at: %s", os.Getpid(), util.SanitizeCredentialURLs(setting.Indexer.RepoConnStr))
defer func() {
if err := recover(); err != nil {
log.Error("PANIC whilst initializing repository indexer: %v\nStacktrace: %s", err, log.Stack(2))
log.Error("The indexer files are likely corrupted and may need to be deleted")
log.Error("You can completely remove the \"%s\" index to make Gitea recreate the indexes", util.SanitizeCredentialURLs(setting.Indexer.RepoConnStr))
}
}()
rIndexer = elasticsearch.NewIndexer(setting.Indexer.RepoConnStr, setting.Indexer.RepoIndexerName)
existed, err = rIndexer.Init(ctx)
if err != nil {
// Same failure path as for bleve.
cancel()
(*globalIndexer.Load()).Close()
close(waitChannel)
log.Fatal("PID: %d Unable to initialize the elasticsearch Repository Indexer connstr: %s Error: %v", os.Getpid(), util.SanitizeCredentialURLs(setting.Indexer.RepoConnStr), err)
}
default:
log.Fatal("PID: %d Unknown Indexer type: %s", os.Getpid(), setting.Indexer.RepoType)
}
// Replace the dummy indexer with the initialized one.
globalIndexer.Store(&rIndexer)
// Start processing the queue
go graceful.GetManager().RunWithCancel(indexerQueue)
if !existed { // populate the index because it's created for the first time
go graceful.GetManager().RunWithShutdownContext(populateRepoIndexer)
}
// Report how long initialization took, unless shutdown has started.
select {
case waitChannel <- time.Since(start):
case <-graceful.GetManager().IsShutdown():
}
close(waitChannel)
}()
if setting.Indexer.StartupTimeout > 0 {
// Watch the initialization: whichever of shutdown, completion/failure, or timeout happens first wins.
go func() {
pprof.SetGoroutineLabels(ctx)
timeout := setting.Indexer.StartupTimeout
// A child process with a positive hammer time gets that time added to the timeout.
if graceful.GetManager().IsChild() && setting.GracefulHammerTime > 0 {
timeout += setting.GracefulHammerTime
}
select {
case <-graceful.GetManager().IsShutdown():
log.Warn("Shutdown before Repository Indexer completed initialization")
cancel()
(*globalIndexer.Load()).Close()
case duration, ok := <-waitChannel:
// !ok means waitChannel was closed without a value being sent, i.e. initialization failed.
if !ok {
log.Warn("Repository Indexer Initialization failed")
cancel()
(*globalIndexer.Load()).Close()
return
}
log.Info("Repository Indexer Initialization took %v", duration)
case <-time.After(timeout):
cancel()
(*globalIndexer.Load()).Close()
log.Fatal("Repository Indexer Initialization Timed-Out after: %v", timeout)
}
}()
}
}
// UpdateRepoIndexer queues repo for (re)indexing. A failure to push onto the
// indexer queue is logged and not returned to the caller.
func UpdateRepoIndexer(repo *repo_model.Repository) {
	item := &internal.IndexerData{RepoID: repo.ID}
	err := indexerQueue.Push(item)
	if err == nil {
		return
	}
	log.Error("Update repo index data %v failed: %v", item, err)
}
// IsAvailable reports whether the code indexer currently stored in
// globalIndexer answers a Ping without error.
func IsAvailable(ctx context.Context) bool {
	current := *globalIndexer.Load()
	pingErr := current.Ping(ctx)
	return pingErr == nil
}
// populateRepoIndexer queues every pre-existing repository for indexing. It
// should only be run when the indexer is created for the first time.
//
// Existing indexer status records are wiped first. Repositories are then
// fetched in batches of 50, walking downwards from the highest repository ID,
// and pushed onto the indexer queue. The run stops early when ctx is done or
// when fetching or queueing fails.
func populateRepoIndexer(ctx context.Context) {
	log.Info("Populating the repo indexer with existing repositories")

	// shuttingDown reports (and logs) whether ctx has been cancelled.
	shuttingDown := func() bool {
		select {
		case <-ctx.Done():
			log.Info("Repository Indexer population shutdown before completion")
			return true
		default:
			return false
		}
	}

	exist, err := db.IsTableNotEmpty("repository")
	if err != nil {
		log.Fatal("System error: %v", err)
	} else if !exist {
		return
	}

	// We are starting afresh, so drop any indexer metadata left in the DB.
	if err := db.DeleteAllRecords("repo_indexer_status"); err != nil {
		log.Fatal("System error: %v", err)
	}

	maxRepoID, err := db.GetMaxID("repository")
	if err != nil {
		log.Fatal("System error: %v", err)
	}

	// start with the maximum existing repo ID and work backwards, so that we
	// don't include repos that are created after gitea starts; such repos will
	// already be added to the indexer, and we don't need to add them again.
	for maxRepoID > 0 {
		if shuttingDown() {
			return
		}
		ids, err := repo_model.GetUnindexedRepos(ctx, repo_model.RepoIndexerTypeCode, maxRepoID, 0, 50)
		if err != nil {
			log.Error("populateRepoIndexer: %v", err)
			return
		}
		if len(ids) == 0 {
			break
		}
		for _, id := range ids {
			if shuttingDown() {
				return
			}
			if err := indexerQueue.Push(&internal.IndexerData{RepoID: id}); err != nil {
				log.Error("indexerQueue.Push: %v", err)
				return
			}
			// The next batch continues below the ID that was just queued.
			maxRepoID = id - 1
		}
	}
	log.Info("Done (re)populating the repo indexer with existing repositories")
}
// SupportedSearchModes returns the search modes offered by the indexer
// currently stored in globalIndexer, or nil when none is stored.
func SupportedSearchModes() []indexer.SearchMode {
	if current := globalIndexer.Load(); current != nil {
		return (*current).SupportedSearchModes()
	}
	return nil
}
+352
View File
@@ -0,0 +1,352 @@
// Copyright 2020 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package code
import (
"context"
"os"
"slices"
"testing"
"time"
"gitea.dev/models/db"
"gitea.dev/models/unittest"
indexer_module "gitea.dev/modules/indexer"
"gitea.dev/modules/indexer/code/bleve"
"gitea.dev/modules/indexer/code/elasticsearch"
"gitea.dev/modules/indexer/code/internal"
"gitea.dev/modules/setting"
"gitea.dev/modules/test"
"gitea.dev/modules/util"
_ "gitea.dev/models"
_ "gitea.dev/models/actions"
_ "gitea.dev/models/activities"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// codeSearchResult is the part of a search hit (file name and content) that
// the test scenarios compare against.
type codeSearchResult struct {
Filename string
Content string
}
// TestMain hands this package's test run over to unittest.MainTest.
func TestMain(m *testing.M) {
unittest.MainTest(m)
}
// testIndexer runs the shared search scenarios against indexer inside a
// subtest called name. It indexes the fixture repositories, waits until the
// index is searchable, runs every keyword scenario (checking the number of
// reported languages and that the expected hits appear in the expected
// relative order), and finally deletes the indexed data again.
func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
t.Run(name, func(t *testing.T) {
assert.NoError(t, setupRepositoryIndexes(t.Context(), indexer))
// Wait for the index to catch up: ES/OpenSearch make writes visible
// only after a refresh (default interval: 1s). Bleve is synchronous
// and passes on the first iteration.
require.Eventually(t, func() bool {
total, _, _, err := indexer.Search(t.Context(), &internal.SearchOptions{
Keyword: "Description",
Paginator: &db.ListOptions{Page: 1, PageSize: 1},
})
return err == nil && total > 0
}, 10*time.Second, 100*time.Millisecond, "index did not become searchable")
// Each scenario: repositories to search (nil for no restriction), the keyword,
// the expected number of languages, an optional search mode, and the
// expected hits in their expected relative order.
keywords := []struct {
RepoIDs []int64
Keyword string
Langs int
SearchMode indexer_module.SearchModeType
Results []codeSearchResult
}{
// Search for an exact match on the contents of a file
// This scenario yields a single result (the file README.md on the repo '1')
{
RepoIDs: nil,
Keyword: "Description",
Langs: 1,
Results: []codeSearchResult{
{
Filename: "README.md",
Content: "# repo1\n\nDescription for repo1",
},
},
},
// Search for an exact match on the contents of a file within the repo '2'.
// This scenario yields no results
{
RepoIDs: []int64{2},
Keyword: "Description",
Langs: 0,
},
// Search for an exact match on the contents of a file
// This scenario yields a single result (the file README.md on the repo '1')
{
RepoIDs: nil,
Keyword: "repo1",
Langs: 1,
Results: []codeSearchResult{
{
Filename: "README.md",
Content: "# repo1\n\nDescription for repo1",
},
},
},
// Search for an exact match on the contents of a file within the repo '2'.
// This scenario yields no results
{
RepoIDs: []int64{2},
Keyword: "repo1",
Langs: 0,
},
// Search for a non-existing term.
// This scenario yields no results
{
RepoIDs: nil,
Keyword: "non-exist",
Langs: 0,
},
// Search for an exact match on the contents of a file within the repo '62'.
// This scenario yields a single result (the file avocado.md on the repo '62')
{
RepoIDs: []int64{62},
Keyword: "pineaple",
Langs: 1,
Results: []codeSearchResult{
{
Filename: "avocado.md",
Content: "# repo1\n\npineaple pie of cucumber juice",
},
},
},
// Search for an exact match on the filename within the repo '62'.
// This scenario yields a single result (the file avocado.md on the repo '62')
{
RepoIDs: []int64{62},
Keyword: "avocado.md",
Langs: 1,
Results: []codeSearchResult{
{
Filename: "avocado.md",
Content: "# repo1\n\npineaple pie of cucumber juice",
},
},
},
// Search for a partial match on the filename within the repo '62'.
// This scenario yields a single result (the file avocado.md on the repo '62')
{
RepoIDs: []int64{62},
Keyword: "avo",
Langs: 1,
Results: []codeSearchResult{
{
Filename: "avocado.md",
Content: "# repo1\n\npineaple pie of cucumber juice",
},
},
},
// Search for matches on both the contents and the filenames within the repo '62'.
// This scenario yields two results: the first result is based on the file (cucumber.md) while the second is based on the contents
{
RepoIDs: []int64{62},
Keyword: "cucumber",
Langs: 1,
Results: []codeSearchResult{
{
Filename: "cucumber.md",
Content: "Salad is good for your health",
},
{
Filename: "avocado.md",
Content: "# repo1\n\npineaple pie of cucumber juice",
},
},
},
// Search for matches on the filenames within the repo '62'.
// This scenario yields two results (both are based on filename, the first one is an exact match)
{
RepoIDs: []int64{62},
Keyword: "ham",
Langs: 1,
Results: []codeSearchResult{
{
Filename: "ham.md",
Content: "This is also not cheese",
},
{
Filename: "potato/ham.md",
Content: "This is not cheese",
},
},
},
// Search for matches on the contents of files within the repo '62'.
// This scenario yields two results (both are based on contents, the first one is an exact match whereas the second is a 'fuzzy' one)
{
RepoIDs: []int64{62},
Keyword: "This is not cheese",
Langs: 1,
Results: []codeSearchResult{
{
Filename: "potato/ham.md",
Content: "This is not cheese",
},
{
Filename: "ham.md",
Content: "This is also not cheese",
},
},
},
// Search for matches on the contents of files regardless of case.
{
RepoIDs: nil,
Keyword: "dESCRIPTION",
Langs: 1,
SearchMode: indexer_module.SearchModeFuzzy,
Results: []codeSearchResult{
{
Filename: "README.md",
Content: "# repo1\n\nDescription for repo1",
},
},
},
// Search for an exact match on the filename within the repo '62' (case-insensitive).
// This scenario yields a single result (the file avocado.md on the repo '62')
{
RepoIDs: []int64{62},
Keyword: "AVOCADO.MD",
Langs: 1,
Results: []codeSearchResult{
{
Filename: "avocado.md",
Content: "# repo1\n\npineaple pie of cucumber juice",
},
},
},
// Search for matches on the contents of files when the criteria are an expression.
{
RepoIDs: []int64{62},
Keyword: "console.log",
Langs: 1,
Results: []codeSearchResult{
{
Filename: "example-file.js",
Content: "console.log(\"Hello, World!\")",
},
},
},
// Search for matches on the contents of files when the criteria are parts of an expression.
{
RepoIDs: []int64{62},
Keyword: "log",
Langs: 1,
Results: []codeSearchResult{
{
Filename: "example-file.js",
Content: "console.log(\"Hello, World!\")",
},
},
},
}
for _, kw := range keywords {
t.Run(kw.Keyword, func(t *testing.T) {
// Scenarios that leave SearchMode at its zero value are run in words mode.
total, res, langs, err := indexer.Search(t.Context(), &internal.SearchOptions{
RepoIDs: kw.RepoIDs,
Keyword: kw.Keyword,
SearchMode: util.IfZero(kw.SearchMode, indexer_module.SearchModeWords),
Paginator: &db.ListOptions{
Page: 1,
PageSize: 10,
},
})
require.NoError(t, err)
require.Len(t, langs, kw.Langs)
hits := make([]codeSearchResult, 0, len(res))
// A scenario that produces hits must also declare which hits it expects.
if total > 0 {
assert.NotEmpty(t, kw.Results, "The given scenario does not provide any expected results")
}
for _, hit := range res {
hits = append(hits, codeSearchResult{
Filename: hit.Filename,
Content: hit.Content,
})
}
// Every expected hit must be present, and the expected hits must appear
// in the same relative order as listed; unexpected extra hits are tolerated.
lastIndex := -1
for _, expected := range kw.Results {
index := slices.Index(hits, expected)
if index == -1 {
assert.Failf(t, "Result not found", "Expected %v in %v", expected, hits)
} else if lastIndex > index {
assert.Failf(t, "Result is out of order", "The order of %v within %v is wrong", expected, hits)
} else {
lastIndex = index
}
}
})
}
assert.NoError(t, tearDownRepositoryIndexes(t.Context(), indexer))
})
}
// TestBleveIndexAndSearch runs the shared indexer scenarios against a bleve
// index created in a temporary directory, with the bleve max-fuzziness
// setting mocked to 2 for the duration of the test.
func TestBleveIndexAndSearch(t *testing.T) {
	unittest.PrepareTestEnv(t)

	restoreFuzziness := test.MockVariableValue(&setting.Indexer.TypeBleveMaxFuzzniess, 2)
	defer restoreFuzziness()

	bleveIndexer := bleve.NewIndexer(t.TempDir())
	defer bleveIndexer.Close()

	_, initErr := bleveIndexer.Init(t.Context())
	require.NoError(t, initErr)

	testIndexer("bleve", t, bleveIndexer)
}
// TestESIndexAndSearch runs the shared indexer scenarios against the
// Elasticsearch instance named by TEST_INDEXER_CODE_ES_URL. The test is
// skipped when that environment variable is empty.
func TestESIndexAndSearch(t *testing.T) {
	unittest.PrepareTestEnv(t)

	esURL := os.Getenv("TEST_INDEXER_CODE_ES_URL")
	if esURL == "" {
		t.SkipNow()
		return
	}

	esIndexer := elasticsearch.NewIndexer(esURL, "gitea_codes")
	_, initErr := esIndexer.Init(t.Context())
	if initErr != nil {
		// Release the indexer before failing the test.
		if esIndexer != nil {
			esIndexer.Close()
		}
		require.NoError(t, initErr, "Unable to init ES indexer")
	}
	defer esIndexer.Close()

	testIndexer("elastic_search", t, esIndexer)
}
// setupRepositoryIndexes indexes every fixture repository returned by
// repositoriesToSearch with indexer, stopping at the first error.
func setupRepositoryIndexes(ctx context.Context, indexer internal.Indexer) error {
	repoIDs := repositoriesToSearch()
	for i := range repoIDs {
		err := index(ctx, indexer, repoIDs[i])
		if err != nil {
			return err
		}
	}
	return nil
}
// tearDownRepositoryIndexes deletes the indexed data of every fixture
// repository returned by repositoriesToSearch, stopping at the first error.
func tearDownRepositoryIndexes(ctx context.Context, indexer internal.Indexer) error {
	repoIDs := repositoriesToSearch()
	for i := range repoIDs {
		err := indexer.Delete(ctx, repoIDs[i])
		if err != nil {
			return err
		}
	}
	return nil
}
// repositoriesToSearch lists the IDs of the fixture repositories (1 and 62)
// that the indexer tests index and search.
func repositoriesToSearch() []int64 {
	fixtureRepoIDs := [...]int64{1, 62}
	return fixtureRepoIDs[:]
}
+60
View File
@@ -0,0 +1,60 @@
// Copyright 2023 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package internal
import (
"context"
"errors"
"gitea.dev/models/db"
repo_model "gitea.dev/models/repo"
"gitea.dev/modules/indexer"
"gitea.dev/modules/indexer/internal"
)
// Indexer defines an interface to index and search code contents.
// It extends the generic internal.Indexer with code-specific operations.
type Indexer interface {
internal.Indexer
// Index applies changes (file updates and removals) for repo to the index; sha is the commit ID supplied by the caller.
Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *RepoChanges) error
// Delete removes the indexed data of the repository with the given ID.
Delete(ctx context.Context, repoID int64) error
// Search runs a query and returns the total hit count, the hits, the per-language statistics, and an error.
Search(ctx context.Context, opts *SearchOptions) (int64, []*SearchResult, []*SearchResultLanguages, error)
// SupportedSearchModes lists the search modes this indexer offers.
SupportedSearchModes() []indexer.SearchMode
}
// SearchOptions describes a code search request.
type SearchOptions struct {
// RepoIDs are the repositories to search. NOTE(review): nil/empty presumably means "no restriction" — confirm against the indexer implementations.
RepoIDs []int64
// Keyword is the text to search for.
Keyword string
// Language presumably restricts hits to one language — TODO confirm in the indexer implementations.
Language string
// SearchMode selects how Keyword is matched.
SearchMode indexer.SearchModeType
// Paginator supplies the paging of the result list.
db.Paginator
}
// NewDummyIndexer returns a placeholder code indexer: it wraps the generic
// dummy indexer, and its Index, Delete and Search methods always fail with
// "indexer is not ready".
func NewDummyIndexer() Indexer {
	dummy := new(dummyIndexer)
	dummy.Indexer = internal.NewDummyIndexer()
	return dummy
}
// dummyIndexer is a placeholder code indexer. It embeds a generic
// internal.Indexer and rejects every code-specific operation with an
// "indexer is not ready" error.
type dummyIndexer struct {
internal.Indexer
}
// SupportedSearchModes returns nil: the dummy indexer offers no search modes.
func (d *dummyIndexer) SupportedSearchModes() []indexer.SearchMode {
return nil
}
// Index always fails because no real indexer is available.
func (d *dummyIndexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *RepoChanges) error {
return errors.New("indexer is not ready")
}
// Delete always fails because no real indexer is available.
func (d *dummyIndexer) Delete(ctx context.Context, repoID int64) error {
return errors.New("indexer is not ready")
}
// Search always fails because no real indexer is available; all result values are zero.
func (d *dummyIndexer) Search(ctx context.Context, opts *SearchOptions) (int64, []*SearchResult, []*SearchResultLanguages, error) {
return 0, nil, nil, errors.New("indexer is not ready")
}
+44
View File
@@ -0,0 +1,44 @@
// Copyright 2023 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package internal
import "gitea.dev/modules/timeutil"
// FileUpdate describes one added or modified file that has to be (re)indexed.
type FileUpdate struct {
// Filename is the path of the file.
Filename string
// BlobSha is presumably the git blob ID of the file's content — TODO confirm against the producer.
BlobSha string
// Size is the file size. NOTE(review): likely bytes, and likely only meaningful when Sized is true — confirm.
Size int64
Sized bool
}
// RepoChanges holds the changes (file additions/updates/removals) to a repo.
type RepoChanges struct {
// Updates lists the files that were added or modified.
Updates []FileUpdate
// RemovedFilenames lists the paths of files that were removed.
RemovedFilenames []string
}
// IndexerData represents data stored in the code indexer.
type IndexerData struct {
// RepoID is the ID of the repository this item refers to.
RepoID int64
}
// SearchResult is the result of performing a search in a repo.
type SearchResult struct {
RepoID int64
// StartIndex and EndIndex are offsets into Content that delimit the matched selection.
StartIndex int
EndIndex int
Filename string
// Content is the content of the file the hit belongs to.
Content string
CommitID string
UpdatedUnix timeutil.TimeStamp
Language string
// Color is presumably the display color associated with Language — TODO confirm.
Color string
}
// SearchResultLanguages is one entry of the top-languages count in search results.
type SearchResultLanguages struct {
Language string
// Color is presumably the display color associated with Language — TODO confirm.
Color string
// Count is the number of hits counted for Language.
Count int
}
+48
View File
@@ -0,0 +1,48 @@
// Copyright 2023 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package internal
import (
"strings"
"gitea.dev/modules/indexer/internal"
"gitea.dev/modules/log"
)
// filenameMatchNumberOfLines is the line-counter value at which FilenameMatchIndexPos stops scanning.
const filenameMatchNumberOfLines = 7 // Copied from GitHub search
// FilenameIndexerID builds the indexer document ID for a file: the base-36
// encoded repository ID, an underscore, then the filename.
func FilenameIndexerID(repoID int64, filename string) string {
	encodedRepoID := internal.Base36(repoID)
	return encodedRepoID + "_" + filename
}
// ParseIndexerID splits an indexer document ID at its first underscore into
// the repository ID (decoded from base 36) and the filename. An ID without an
// underscore is logged as an error; the whole ID is then treated as the
// encoded repository ID and the filename is empty.
func ParseIndexerID(indexerID string) (int64, string) {
	encodedRepoID, filename := indexerID, ""
	if sep := strings.Index(indexerID, "_"); sep >= 0 {
		encodedRepoID, filename = indexerID[:sep], indexerID[sep+1:]
	} else {
		log.Error("Unexpected ID in repo indexer: %s", indexerID)
	}
	// The decode error is deliberately discarded; repoID is whatever ParseBase36 returns.
	repoID, _ := internal.ParseBase36(encodedRepoID)
	return repoID, filename
}
// FilenameOfIndexerID returns the filename part of an indexer document ID,
// i.e. everything after the first underscore. An ID without an underscore is
// logged as an error and yields an empty filename.
func FilenameOfIndexerID(indexerID string) string {
	sep := strings.Index(indexerID, "_")
	if sep < 0 {
		log.Error("Unexpected ID in repo indexer: %s", indexerID)
		return ""
	}
	return indexerID[sep+1:]
}
// FilenameMatchIndexPos returns the boundaries of the leading excerpt of
// content that is shown for a filename match. The start is always 0. The end
// is the byte offset of the newline at which the line counter (starting at 1
// and incremented per newline) reaches filenameMatchNumberOfLines, or
// len(content) when content has fewer newlines than that.
func FilenameMatchIndexPos(content string) (int, int) {
	lineCounter := 1
	for pos := 0; pos < len(content); pos++ {
		if content[pos] != '\n' {
			continue
		}
		lineCounter++
		if lineCounter == filenameMatchNumberOfLines {
			return 0, pos
		}
	}
	return 0, len(content)
}
+153
View File
@@ -0,0 +1,153 @@
// Copyright 2017 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package code
import (
"bytes"
"context"
"html/template"
"strings"
"gitea.dev/modules/highlight"
"gitea.dev/modules/indexer/code/internal"
"gitea.dev/modules/timeutil"
)
// Result is a search result prepared for display.
type Result struct {
RepoID int64
Filename string
CommitID string
UpdatedUnix timeutil.TimeStamp
Language string
Color string
// Lines are the highlighted lines of the excerpt, each with its line number.
Lines []*ResultLine
}
// ResultLine is one displayed line of a search result.
type ResultLine struct {
// Num is the line number of this line.
Num int
// FormattedContent is the syntax-highlighted HTML of the line.
FormattedContent template.HTML
}
// SearchResultLanguages is an alias of internal.SearchResultLanguages.
type SearchResultLanguages = internal.SearchResultLanguages
// SearchOptions is an alias of internal.SearchOptions.
type SearchOptions = internal.SearchOptions
// indices widens the selection [selectionStartIndex, selectionEndIndex) of
// content so that it also covers the surrounding context lines. The start
// moves backwards until it sits right after the second newline preceding it
// (or reaches 0); the end moves forwards until it sits on the second newline
// at or after it (or reaches len(content)).
func indices(content string, selectionStartIndex, selectionEndIndex int) (int, int) {
	from := selectionStartIndex
	for newlinesSeen := 0; from > 0; from-- {
		if content[from-1] != '\n' {
			continue
		}
		if newlinesSeen == 1 {
			break
		}
		newlinesSeen++
	}

	to := selectionEndIndex
	for newlinesSeen := 0; to < len(content); to++ {
		if content[to] != '\n' {
			continue
		}
		if newlinesSeen == 1 {
			break
		}
		newlinesSeen++
	}

	return from, to
}
// writeStrings appends each of strs to buf in order and returns the first
// write error, if any.
func writeStrings(buf *bytes.Buffer, strs ...string) error {
	for i := range strs {
		if _, err := buf.WriteString(strs[i]); err != nil {
			return err
		}
	}
	return nil
}
// HighlightSearchResultCode syntax-highlights code (using a lexer chosen from
// filename and language) and pairs each highlighted line with the matching
// entry of lineNums. The result has as many lines as the shorter of the two.
func HighlightSearchResultCode(filename, language string, lineNums []int, code string) []*ResultLine {
	// The whole block is highlighted in one go; highlighting line by line
	// doesn't work well with constructs that span multiple lines.
	lexer := highlight.DetectChromaLexerByFileName(filename, language)
	rendered := highlight.RenderCodeByLexer(lexer, code)
	htmlLines := highlight.UnsafeSplitHighlightedLines(rendered)

	// The renderer may produce fewer lines than lineNums has entries, because
	// "highlight" removes the last `\n`.
	count := min(len(htmlLines), len(lineNums))
	resultLines := make([]*ResultLine, 0, count)
	for i := 0; i < count; i++ {
		resultLines = append(resultLines, &ResultLine{
			Num:              lineNums[i],
			FormattedContent: template.HTML(htmlLines[i]),
		})
	}
	return resultLines
}
// searchResult converts an indexer hit into a display Result. The excerpt
// result.Content[startIndex:endIndex] is split into lines, numbered according
// to its position in the full content, and highlighted.
func searchResult(result *internal.SearchResult, startIndex, endIndex int) (*Result, error) {
	// Line number (1-based) of the first line of the excerpt.
	firstLineNum := strings.Count(result.Content[:startIndex], "\n") + 1
	segments := strings.SplitAfter(result.Content[startIndex:endIndex], "\n")

	var excerpt bytes.Buffer
	lineNums := make([]int, 0, len(segments))
	offset := startIndex // offset of the current segment within result.Content
	for i, segment := range segments {
		overlapsMatch := offset < result.EndIndex &&
			result.StartIndex < offset+len(segment) &&
			result.StartIndex < result.EndIndex

		var err error
		if overlapsMatch {
			// Write the segment as before-match, match, after-match pieces.
			matchFrom := max(result.StartIndex-offset, 0)
			matchTo := min(result.EndIndex-offset, len(segment))
			err = writeStrings(&excerpt,
				segment[:matchFrom],
				segment[matchFrom:matchTo],
				segment[matchTo:],
			)
		} else {
			err = writeStrings(&excerpt, segment)
		}
		if err != nil {
			return nil, err
		}

		lineNums = append(lineNums, firstLineNum+i)
		offset += len(segment)
	}

	highlighted := HighlightSearchResultCode(result.Filename, result.Language, lineNums, excerpt.String())
	return &Result{
		RepoID:      result.RepoID,
		Filename:    result.Filename,
		CommitID:    result.CommitID,
		UpdatedUnix: result.UpdatedUnix,
		Language:    result.Language,
		Color:       result.Color,
		Lines:       highlighted,
	}, nil
}
// PerformSearch runs a code search through the global indexer and converts
// every hit into a display Result with surrounding context lines. A nil opts
// or an empty keyword yields an empty result without querying the indexer.
// It returns the total hit count, the display results, and the per-language
// statistics.
func PerformSearch(ctx context.Context, opts *SearchOptions) (int64, []*Result, []*SearchResultLanguages, error) {
	if opts == nil || len(opts.Keyword) == 0 {
		return 0, nil, nil, nil
	}

	current := *globalIndexer.Load()
	total, hits, languages, err := current.Search(ctx, opts)
	if err != nil {
		return 0, nil, nil, err
	}

	displayResults := make([]*Result, 0, len(hits))
	for _, hit := range hits {
		from, to := indices(hit.Content, hit.StartIndex, hit.EndIndex)
		display, convErr := searchResult(hit, from, to)
		if convErr != nil {
			return 0, nil, nil, convErr
		}
		displayResults = append(displayResults, display)
	}
	return total, displayResults, languages, nil
}