初始提交: Gitea 项目代码
This commit is contained in:
@@ -0,0 +1,392 @@
|
||||
// Copyright 2019 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package bleve
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
repo_model "gitea.dev/models/repo"
|
||||
"gitea.dev/modules/analyze"
|
||||
"gitea.dev/modules/charset"
|
||||
"gitea.dev/modules/git"
|
||||
"gitea.dev/modules/git/gitcmd"
|
||||
"gitea.dev/modules/gitrepo"
|
||||
"gitea.dev/modules/indexer"
|
||||
path_filter "gitea.dev/modules/indexer/code/bleve/token/path"
|
||||
"gitea.dev/modules/indexer/code/internal"
|
||||
indexer_internal "gitea.dev/modules/indexer/internal"
|
||||
inner_bleve "gitea.dev/modules/indexer/internal/bleve"
|
||||
"gitea.dev/modules/setting"
|
||||
"gitea.dev/modules/timeutil"
|
||||
"gitea.dev/modules/typesniffer"
|
||||
"gitea.dev/modules/util"
|
||||
|
||||
"github.com/blevesearch/bleve/v2"
|
||||
analyzer_custom "github.com/blevesearch/bleve/v2/analysis/analyzer/custom"
|
||||
analyzer_keyword "github.com/blevesearch/bleve/v2/analysis/analyzer/keyword"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/unicodenorm"
|
||||
"github.com/blevesearch/bleve/v2/analysis/tokenizer/letter"
|
||||
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
|
||||
"github.com/blevesearch/bleve/v2/mapping"
|
||||
"github.com/blevesearch/bleve/v2/search/query"
|
||||
"github.com/go-enry/go-enry/v2"
|
||||
)
|
||||
|
||||
// unicodeNormalizeName is the name under which the unicode normalization token
// filter is registered on the index mapping.
const unicodeNormalizeName = "unicodeNormalize"

// maxBatchSize is the batch size given to the flushing batches created by this indexer.
const maxBatchSize = 16
|
||||
|
||||
func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
|
||||
return m.AddCustomTokenFilter(unicodeNormalizeName, map[string]any{
|
||||
"type": unicodenorm.Name,
|
||||
"form": unicodenorm.NFC,
|
||||
})
|
||||
}
|
||||
|
||||
// RepoIndexerData is the document stored in the repo indexer for a single file.
type RepoIndexerData struct {
	// RepoID is the ID of the repository the file belongs to.
	RepoID int64
	// CommitID is the commit SHA the file was indexed at.
	CommitID string
	// Content is the file content converted to UTF-8.
	Content string
	// Filename is the path of the file.
	Filename string
	// Language is the detected code language of the file.
	Language string
	// UpdatedAt is the time at which the document was indexed.
	UpdatedAt time.Time
}
|
||||
|
||||
// Type returns the document type, for bleve's mapping.Classifier interface.
|
||||
func (d *RepoIndexerData) Type() string {
|
||||
return repoIndexerDocType
|
||||
}
|
||||
|
||||
// Names of the custom analyzers, the tokenizer and the document type used by
// the repo indexer.
const (
	repoIndexerAnalyzer      = "repoIndexerAnalyzer"
	filenameIndexerAnalyzer  = "filenameIndexerAnalyzer"
	filenameIndexerTokenizer = "filenameIndexerTokenizer"
	repoIndexerDocType       = "repoIndexerDocType"
)

// repoIndexerLatestVersion is the index version handed to inner_bleve.NewIndexer.
const repoIndexerLatestVersion = 9
|
||||
|
||||
// generateBleveIndexMapping generates a bleve index mapping for the repo indexer
|
||||
func generateBleveIndexMapping() (mapping.IndexMapping, error) {
|
||||
docMapping := bleve.NewDocumentMapping()
|
||||
numericFieldMapping := bleve.NewNumericFieldMapping()
|
||||
numericFieldMapping.IncludeInAll = false
|
||||
docMapping.AddFieldMappingsAt("RepoID", numericFieldMapping)
|
||||
|
||||
textFieldMapping := bleve.NewTextFieldMapping()
|
||||
textFieldMapping.IncludeInAll = false
|
||||
docMapping.AddFieldMappingsAt("Content", textFieldMapping)
|
||||
|
||||
fileNamedMapping := bleve.NewTextFieldMapping()
|
||||
fileNamedMapping.IncludeInAll = false
|
||||
fileNamedMapping.Analyzer = filenameIndexerAnalyzer
|
||||
docMapping.AddFieldMappingsAt("Filename", fileNamedMapping)
|
||||
|
||||
termFieldMapping := bleve.NewTextFieldMapping()
|
||||
termFieldMapping.IncludeInAll = false
|
||||
termFieldMapping.Analyzer = analyzer_keyword.Name
|
||||
docMapping.AddFieldMappingsAt("Language", termFieldMapping)
|
||||
docMapping.AddFieldMappingsAt("CommitID", termFieldMapping)
|
||||
|
||||
timeFieldMapping := bleve.NewDateTimeFieldMapping()
|
||||
timeFieldMapping.IncludeInAll = false
|
||||
docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping)
|
||||
|
||||
mapping := bleve.NewIndexMapping()
|
||||
|
||||
if err := addUnicodeNormalizeTokenFilter(mapping); err != nil {
|
||||
return nil, err
|
||||
} else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{
|
||||
"type": analyzer_custom.Name,
|
||||
"char_filters": []string{},
|
||||
"tokenizer": letter.Name,
|
||||
"token_filters": []string{unicodeNormalizeName, lowercase.Name},
|
||||
}); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := mapping.AddCustomAnalyzer(filenameIndexerAnalyzer, map[string]any{
|
||||
"type": analyzer_custom.Name,
|
||||
"char_filters": []string{},
|
||||
"tokenizer": unicode.Name,
|
||||
"token_filters": []string{unicodeNormalizeName, path_filter.Name, lowercase.Name},
|
||||
}); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
mapping.DefaultAnalyzer = repoIndexerAnalyzer
|
||||
mapping.AddDocumentMapping(repoIndexerDocType, docMapping)
|
||||
mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping())
|
||||
|
||||
return mapping, nil
|
||||
}
|
||||
|
||||
// Compile-time check that *Indexer implements internal.Indexer.
var _ internal.Indexer = (*Indexer)(nil)
|
||||
|
||||
// Indexer represents a bleve indexer implementation
|
||||
type Indexer struct {
|
||||
inner *inner_bleve.Indexer
|
||||
indexer_internal.Indexer // do not composite inner_bleve.Indexer directly to avoid exposing too much
|
||||
}
|
||||
|
||||
func (b *Indexer) SupportedSearchModes() []indexer.SearchMode {
|
||||
return indexer.SearchModesExactWords()
|
||||
}
|
||||
|
||||
// NewIndexer creates a new bleve local indexer
|
||||
func NewIndexer(indexDir string) *Indexer {
|
||||
inner := inner_bleve.NewIndexer(indexDir, repoIndexerLatestVersion, generateBleveIndexMapping)
|
||||
return &Indexer{
|
||||
Indexer: inner,
|
||||
inner: inner,
|
||||
}
|
||||
}
|
||||
|
||||
func (b *Indexer) addUpdate(ctx context.Context, catFileBatch git.CatFileBatch, commitSha string,
|
||||
update internal.FileUpdate, repo *repo_model.Repository, batch *inner_bleve.FlushingBatch,
|
||||
) error {
|
||||
// Ignore vendored files in code search
|
||||
if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) {
|
||||
return nil
|
||||
}
|
||||
|
||||
size := update.Size
|
||||
|
||||
var err error
|
||||
if !update.Sized {
|
||||
var stdout string
|
||||
stdout, _, err = gitrepo.RunCmdString(ctx, repo, gitcmd.NewCommand("cat-file", "-s").AddDynamicArguments(update.BlobSha))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if size, err = strconv.ParseInt(strings.TrimSpace(stdout), 10, 64); err != nil {
|
||||
return fmt.Errorf("misformatted git cat-file output: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
if size > setting.Indexer.MaxIndexerFileSize {
|
||||
return b.addDelete(update.Filename, repo, batch)
|
||||
}
|
||||
|
||||
info, batchReader, err := catFileBatch.QueryContent(update.BlobSha)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fileContents, err := io.ReadAll(io.LimitReader(batchReader, info.Size))
|
||||
if err != nil {
|
||||
return err
|
||||
} else if !typesniffer.DetectContentType(fileContents).IsText() {
|
||||
// FIXME: UTF-16 files will probably fail here
|
||||
// Even if the file is not recognized as a "text file", we could still put its name into the indexers to make the filename become searchable, while leave the content to empty.
|
||||
fileContents = nil
|
||||
}
|
||||
|
||||
if _, err = batchReader.Discard(1); err != nil {
|
||||
return err
|
||||
}
|
||||
id := internal.FilenameIndexerID(repo.ID, update.Filename)
|
||||
return batch.Index(id, &RepoIndexerData{
|
||||
RepoID: repo.ID,
|
||||
CommitID: commitSha,
|
||||
Filename: update.Filename,
|
||||
Content: string(charset.ToUTF8DropErrors(fileContents)),
|
||||
Language: analyze.GetCodeLanguage(update.Filename, fileContents),
|
||||
UpdatedAt: time.Now().UTC(),
|
||||
})
|
||||
}
|
||||
|
||||
func (b *Indexer) addDelete(filename string, repo *repo_model.Repository, batch *inner_bleve.FlushingBatch) error {
|
||||
id := internal.FilenameIndexerID(repo.ID, filename)
|
||||
return batch.Delete(id)
|
||||
}
|
||||
|
||||
// Index indexes the data
|
||||
func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *internal.RepoChanges) error {
|
||||
batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize)
|
||||
if len(changes.Updates) > 0 {
|
||||
catfileBatch, err := gitrepo.NewBatch(ctx, repo)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer catfileBatch.Close()
|
||||
|
||||
for _, update := range changes.Updates {
|
||||
if err := b.addUpdate(ctx, catfileBatch, sha, update, repo, batch); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
for _, filename := range changes.RemovedFilenames {
|
||||
if err := b.addDelete(filename, repo, batch); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return batch.Flush()
|
||||
}
|
||||
|
||||
// Delete deletes indexes by ids
|
||||
func (b *Indexer) Delete(_ context.Context, repoID int64) error {
|
||||
query := inner_bleve.NumericEqualityQuery(repoID, "RepoID")
|
||||
searchRequest := bleve.NewSearchRequestOptions(query, 2147483647, 0, false)
|
||||
result, err := b.inner.Indexer.Search(searchRequest)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize)
|
||||
for _, hit := range result.Hits {
|
||||
if err = batch.Delete(hit.ID); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return batch.Flush()
|
||||
}
|
||||
|
||||
// Search searches for files in the specified repo.
|
||||
// Returns the matching file-paths
|
||||
func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
|
||||
var (
|
||||
indexerQuery query.Query
|
||||
keywordQuery query.Query
|
||||
contentQuery query.Query
|
||||
)
|
||||
|
||||
pathQuery := bleve.NewPrefixQuery(strings.ToLower(opts.Keyword))
|
||||
pathQuery.FieldVal = "Filename"
|
||||
pathQuery.SetBoost(10)
|
||||
|
||||
searchMode := util.IfZero(opts.SearchMode, b.SupportedSearchModes()[0].ModeValue)
|
||||
if searchMode == indexer.SearchModeExact {
|
||||
// 1.21 used NewPrefixQuery, but it seems not working well, and later releases changed to NewMatchPhraseQuery
|
||||
q := bleve.NewMatchPhraseQuery(opts.Keyword)
|
||||
q.Analyzer = repoIndexerAnalyzer
|
||||
q.FieldVal = "Content"
|
||||
contentQuery = q
|
||||
} else /* words */ {
|
||||
q := bleve.NewMatchQuery(opts.Keyword)
|
||||
q.FieldVal = "Content"
|
||||
q.Analyzer = repoIndexerAnalyzer
|
||||
if searchMode == indexer.SearchModeFuzzy {
|
||||
// this logic doesn't seem right, it is only used to pass the test-case `Keyword: "dESCRIPTION"`, which doesn't seem to be a real-life use-case.
|
||||
q.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword)
|
||||
} else {
|
||||
q.Operator = query.MatchQueryOperatorAnd
|
||||
}
|
||||
contentQuery = q
|
||||
}
|
||||
|
||||
keywordQuery = bleve.NewDisjunctionQuery(contentQuery, pathQuery)
|
||||
|
||||
if len(opts.RepoIDs) > 0 {
|
||||
repoQueries := make([]query.Query, 0, len(opts.RepoIDs))
|
||||
for _, repoID := range opts.RepoIDs {
|
||||
repoQueries = append(repoQueries, inner_bleve.NumericEqualityQuery(repoID, "RepoID"))
|
||||
}
|
||||
|
||||
indexerQuery = bleve.NewConjunctionQuery(
|
||||
bleve.NewDisjunctionQuery(repoQueries...),
|
||||
keywordQuery,
|
||||
)
|
||||
} else {
|
||||
indexerQuery = keywordQuery
|
||||
}
|
||||
|
||||
// Save for reuse without language filter
|
||||
facetQuery := indexerQuery
|
||||
if len(opts.Language) > 0 {
|
||||
languageQuery := bleve.NewMatchQuery(opts.Language)
|
||||
languageQuery.FieldVal = "Language"
|
||||
languageQuery.Analyzer = analyzer_keyword.Name
|
||||
|
||||
indexerQuery = bleve.NewConjunctionQuery(
|
||||
indexerQuery,
|
||||
languageQuery,
|
||||
)
|
||||
}
|
||||
|
||||
from, pageSize := opts.GetSkipTake()
|
||||
searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false)
|
||||
searchRequest.Fields = []string{"Content", "Filename", "RepoID", "Language", "CommitID", "UpdatedAt"}
|
||||
searchRequest.IncludeLocations = true
|
||||
|
||||
if len(opts.Language) == 0 {
|
||||
searchRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
|
||||
}
|
||||
|
||||
searchRequest.SortBy([]string{"-_score", "UpdatedAt"})
|
||||
|
||||
result, err := b.inner.Indexer.SearchInContext(ctx, searchRequest)
|
||||
if err != nil {
|
||||
return 0, nil, nil, err
|
||||
}
|
||||
|
||||
total := int64(result.Total)
|
||||
|
||||
searchResults := make([]*internal.SearchResult, len(result.Hits))
|
||||
for i, hit := range result.Hits {
|
||||
startIndex, endIndex := -1, -1
|
||||
for _, locations := range hit.Locations["Content"] {
|
||||
location := locations[0]
|
||||
locationStart := int(location.Start)
|
||||
locationEnd := int(location.End)
|
||||
if startIndex < 0 || locationStart < startIndex {
|
||||
startIndex = locationStart
|
||||
}
|
||||
if endIndex < 0 || locationEnd > endIndex {
|
||||
endIndex = locationEnd
|
||||
}
|
||||
}
|
||||
if len(hit.Locations["Filename"]) > 0 {
|
||||
startIndex, endIndex = internal.FilenameMatchIndexPos(hit.Fields["Content"].(string))
|
||||
}
|
||||
|
||||
language := hit.Fields["Language"].(string)
|
||||
var updatedUnix timeutil.TimeStamp
|
||||
if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil {
|
||||
updatedUnix = timeutil.TimeStamp(t.Unix())
|
||||
}
|
||||
searchResults[i] = &internal.SearchResult{
|
||||
RepoID: int64(hit.Fields["RepoID"].(float64)),
|
||||
StartIndex: startIndex,
|
||||
EndIndex: endIndex,
|
||||
Filename: internal.FilenameOfIndexerID(hit.ID),
|
||||
Content: hit.Fields["Content"].(string),
|
||||
CommitID: hit.Fields["CommitID"].(string),
|
||||
UpdatedUnix: updatedUnix,
|
||||
Language: language,
|
||||
Color: enry.GetColor(language),
|
||||
}
|
||||
}
|
||||
|
||||
searchResultLanguages := make([]*internal.SearchResultLanguages, 0, 10)
|
||||
if len(opts.Language) > 0 {
|
||||
// Use separate query to go get all language counts
|
||||
facetRequest := bleve.NewSearchRequestOptions(facetQuery, 1, 0, false)
|
||||
facetRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
|
||||
facetRequest.IncludeLocations = true
|
||||
facetRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
|
||||
|
||||
if result, err = b.inner.Indexer.Search(facetRequest); err != nil {
|
||||
return 0, nil, nil, err
|
||||
}
|
||||
}
|
||||
languagesFacet := result.Facets["languages"]
|
||||
for _, term := range languagesFacet.Terms.Terms() {
|
||||
if len(term.Term) == 0 {
|
||||
continue
|
||||
}
|
||||
searchResultLanguages = append(searchResultLanguages, &internal.SearchResultLanguages{
|
||||
Language: term.Term,
|
||||
Color: enry.GetColor(term.Term),
|
||||
Count: term.Count,
|
||||
})
|
||||
}
|
||||
return total, searchResults, searchResultLanguages, nil
|
||||
}
|
||||
@@ -0,0 +1,105 @@
|
||||
// Copyright 2024 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package path
|
||||
|
||||
import (
|
||||
"slices"
|
||||
"strings"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// Name is the name under which the path token filter is registered in the bleve registry.
const Name = "gitea/path"
|
||||
|
||||
// TokenFilter is a bleve token filter that turns the tokens of a path into
// cumulative path terms, in both forward and reversed order. It holds no state.
type TokenFilter struct{}
|
||||
|
||||
func NewTokenFilter() *TokenFilter {
|
||||
return &TokenFilter{}
|
||||
}
|
||||
|
||||
func TokenFilterConstructor(config map[string]any, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewTokenFilter(), nil
|
||||
}
|
||||
|
||||
func (s *TokenFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
if len(input) == 1 {
|
||||
// if there is only one token, we don't need to generate the reversed chain
|
||||
return generatePathTokens(input, false)
|
||||
}
|
||||
|
||||
normal := generatePathTokens(input, false)
|
||||
reversed := generatePathTokens(input, true)
|
||||
|
||||
return append(normal, reversed...)
|
||||
}
|
||||
|
||||
// Generates path tokens from the input tokens.
|
||||
// This mimics the behavior of the path hierarchy tokenizer in ES. It takes the input tokens and combine them, generating a term for each component
|
||||
// in tree (e.g., foo/bar/baz.md will generate foo, foo/bar, and foo/bar/baz.md).
|
||||
//
|
||||
// If the reverse flag is set, the order of the tokens is reversed (the same input will generate baz.md, baz.md/bar, baz.md/bar/foo). This is useful
|
||||
// to efficiently search for filenames without supplying the fullpath.
|
||||
func generatePathTokens(input analysis.TokenStream, reversed bool) analysis.TokenStream {
|
||||
terms := make([]string, 0, len(input))
|
||||
longestTerm := 0
|
||||
|
||||
if reversed {
|
||||
slices.Reverse(input)
|
||||
}
|
||||
|
||||
for i := range input {
|
||||
var sb strings.Builder
|
||||
sb.Write(input[0].Term)
|
||||
|
||||
for j := 1; j < i; j++ {
|
||||
sb.WriteString("/")
|
||||
sb.Write(input[j].Term)
|
||||
}
|
||||
|
||||
term := sb.String()
|
||||
|
||||
if longestTerm < len(term) {
|
||||
longestTerm = len(term)
|
||||
}
|
||||
|
||||
terms = append(terms, term)
|
||||
}
|
||||
|
||||
output := make(analysis.TokenStream, 0, len(terms))
|
||||
|
||||
for _, term := range terms {
|
||||
var start, end int
|
||||
|
||||
if reversed {
|
||||
start = 0
|
||||
end = len(term)
|
||||
} else {
|
||||
start = longestTerm - len(term)
|
||||
end = longestTerm
|
||||
}
|
||||
|
||||
token := analysis.Token{
|
||||
Position: 1,
|
||||
Start: start,
|
||||
End: end,
|
||||
Type: analysis.AlphaNumeric,
|
||||
Term: []byte(term),
|
||||
}
|
||||
|
||||
output = append(output, &token)
|
||||
}
|
||||
|
||||
return output
|
||||
}
|
||||
|
||||
func init() {
|
||||
// FIXME: move it to the bleve's init function, but do not call it in global init
|
||||
err := registry.RegisterTokenFilter(Name, TokenFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,76 @@
|
||||
// Copyright 2024 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package path
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
// Scenario pairs an input string with the tokens expected for it.
// NOTE(review): the tests visible in this file declare their own anonymous
// struct and do not reference this type.
type Scenario struct {
	// Input is the text to analyze.
	Input string
	// Tokens are the expected tokens for Input.
	Tokens []string
}
|
||||
|
||||
func TestTokenFilter(t *testing.T) {
|
||||
scenarios := []struct {
|
||||
Input string
|
||||
Terms []string
|
||||
}{
|
||||
{
|
||||
Input: "Dockerfile",
|
||||
Terms: []string{"Dockerfile"},
|
||||
},
|
||||
{
|
||||
Input: "Dockerfile.rootless",
|
||||
Terms: []string{"Dockerfile.rootless"},
|
||||
},
|
||||
{
|
||||
Input: "a/b/c/Dockerfile.rootless",
|
||||
Terms: []string{"a", "a/b", "a/b/c", "a/b/c/Dockerfile.rootless", "Dockerfile.rootless", "Dockerfile.rootless/c", "Dockerfile.rootless/c/b", "Dockerfile.rootless/c/b/a"},
|
||||
},
|
||||
{
|
||||
Input: "",
|
||||
Terms: []string{},
|
||||
},
|
||||
}
|
||||
|
||||
for _, scenario := range scenarios {
|
||||
t.Run(fmt.Sprintf("ensure terms of '%s'", scenario.Input), func(t *testing.T) {
|
||||
terms := extractTerms(scenario.Input)
|
||||
|
||||
assert.Len(t, terms, len(scenario.Terms))
|
||||
|
||||
for _, term := range terms {
|
||||
assert.Contains(t, scenario.Terms, term)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func extractTerms(input string) []string {
|
||||
tokens := tokenize(input)
|
||||
filteredTokens := filter(tokens)
|
||||
terms := make([]string, 0, len(filteredTokens))
|
||||
|
||||
for _, token := range filteredTokens {
|
||||
terms = append(terms, string(token.Term))
|
||||
}
|
||||
|
||||
return terms
|
||||
}
|
||||
|
||||
func filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
filter := NewTokenFilter()
|
||||
return filter.Filter(input)
|
||||
}
|
||||
|
||||
func tokenize(input string) analysis.TokenStream {
|
||||
tokenizer := unicode.NewUnicodeTokenizer()
|
||||
return tokenizer.Tokenize([]byte(input))
|
||||
}
|
||||
@@ -0,0 +1,405 @@
|
||||
// Copyright 2020 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package elasticsearch
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
repo_model "gitea.dev/models/repo"
|
||||
"gitea.dev/modules/analyze"
|
||||
"gitea.dev/modules/charset"
|
||||
"gitea.dev/modules/git"
|
||||
"gitea.dev/modules/git/gitcmd"
|
||||
"gitea.dev/modules/gitrepo"
|
||||
"gitea.dev/modules/indexer"
|
||||
"gitea.dev/modules/indexer/code/internal"
|
||||
es "gitea.dev/modules/indexer/internal/elasticsearch"
|
||||
"gitea.dev/modules/json"
|
||||
"gitea.dev/modules/log"
|
||||
"gitea.dev/modules/setting"
|
||||
"gitea.dev/modules/timeutil"
|
||||
"gitea.dev/modules/typesniffer"
|
||||
"gitea.dev/modules/util"
|
||||
|
||||
"github.com/go-enry/go-enry/v2"
|
||||
)
|
||||
|
||||
// esRepoIndexerLatestVersion is the index version handed to es.NewIndexer.
const esRepoIndexerLatestVersion = 3
|
||||
|
||||
// Compile-time check that *Indexer implements internal.Indexer.
var _ internal.Indexer = (*Indexer)(nil)
|
||||
|
||||
// Indexer implements Indexer interface
|
||||
type Indexer struct {
|
||||
*es.Indexer
|
||||
}
|
||||
|
||||
func (b *Indexer) SupportedSearchModes() []indexer.SearchMode {
|
||||
return indexer.SearchModesExactWords()
|
||||
}
|
||||
|
||||
// NewIndexer creates a new elasticsearch indexer
|
||||
func NewIndexer(url, indexerName string) *Indexer {
|
||||
return &Indexer{Indexer: es.NewIndexer(url, indexerName, esRepoIndexerLatestVersion, defaultMapping)}
|
||||
}
|
||||
|
||||
// defaultMapping is the elasticsearch index definition (analysis settings and
// field mappings) passed to es.NewIndexer.
//
// NOTE(review): the "path" sub-field uses the reversed path analyzer while
// "path_reversed" uses the forward one, which looks swapped. Confirm before
// changing; a mapping change would also need an index version bump.
const defaultMapping = `{
	"settings": {
		"analysis": {
			"analyzer": {
				"content_analyzer": {
					"tokenizer": "content_tokenizer",
					"filter" : ["lowercase"]
				},
				"filename_path_analyzer": {
					"tokenizer": "path_tokenizer"
				},
				"reversed_filename_path_analyzer": {
					"tokenizer": "reversed_path_tokenizer"
				}
			},
			"tokenizer": {
				"content_tokenizer": {
					"type": "simple_pattern_split",
					"pattern": "[^a-zA-Z0-9]"
				},
				"path_tokenizer": {
					"type": "path_hierarchy",
					"delimiter": "/"
				},
				"reversed_path_tokenizer": {
					"type": "path_hierarchy",
					"delimiter": "/",
					"reverse": true
				}
			}
		}
	},
	"mappings": {
		"properties": {
			"repo_id": {
				"type": "long",
				"index": true
			},
			"filename": {
				"type": "text",
				"term_vector": "with_positions_offsets",
				"index": true,
				"fields": {
					"path": {
						"type": "text",
						"analyzer": "reversed_filename_path_analyzer"
					},
					"path_reversed": {
						"type": "text",
						"analyzer": "filename_path_analyzer"
					}
				}
			},
			"content": {
				"type": "text",
				"term_vector": "with_positions_offsets",
				"index": true,
				"analyzer": "content_analyzer"
			},
			"commit_id": {
				"type": "keyword",
				"index": true
			},
			"language": {
				"type": "keyword",
				"index": true
			},
			"updated_at": {
				"type": "long",
				"index": true
			}
		}
	}
}`
|
||||
|
||||
func (b *Indexer) addUpdate(ctx context.Context, catFileBatch git.CatFileBatch, sha string, update internal.FileUpdate, repo *repo_model.Repository) ([]es.BulkOp, error) {
|
||||
// Ignore vendored files in code search
|
||||
if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
size := update.Size
|
||||
var err error
|
||||
if !update.Sized {
|
||||
var stdout string
|
||||
stdout, _, err = gitrepo.RunCmdString(ctx, repo, gitcmd.NewCommand("cat-file", "-s").AddDynamicArguments(update.BlobSha))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if size, err = strconv.ParseInt(strings.TrimSpace(stdout), 10, 64); err != nil {
|
||||
return nil, fmt.Errorf("misformatted git cat-file output: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
id := internal.FilenameIndexerID(repo.ID, update.Filename)
|
||||
if size > setting.Indexer.MaxIndexerFileSize {
|
||||
return []es.BulkOp{es.DeleteOp(id)}, nil
|
||||
}
|
||||
|
||||
info, batchReader, err := catFileBatch.QueryContent(update.BlobSha)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
fileContents, err := io.ReadAll(io.LimitReader(batchReader, info.Size))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
} else if !typesniffer.DetectContentType(fileContents).IsText() {
|
||||
// FIXME: UTF-16 files will probably fail here
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
if _, err = batchReader.Discard(1); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return []es.BulkOp{es.IndexOp(id, map[string]any{
|
||||
"repo_id": repo.ID,
|
||||
"filename": update.Filename,
|
||||
"content": string(charset.ToUTF8DropErrors(fileContents)),
|
||||
"commit_id": sha,
|
||||
"language": analyze.GetCodeLanguage(update.Filename, fileContents),
|
||||
"updated_at": timeutil.TimeStampNow(),
|
||||
})}, nil
|
||||
}
|
||||
|
||||
func (b *Indexer) addDelete(filename string, repo *repo_model.Repository) es.BulkOp {
|
||||
return es.DeleteOp(internal.FilenameIndexerID(repo.ID, filename))
|
||||
}
|
||||
|
||||
// Index will save the index data
|
||||
func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *internal.RepoChanges) error {
|
||||
ops := make([]es.BulkOp, 0)
|
||||
if len(changes.Updates) > 0 {
|
||||
batch, err := gitrepo.NewBatch(ctx, repo)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer batch.Close()
|
||||
|
||||
for _, update := range changes.Updates {
|
||||
updateOps, err := b.addUpdate(ctx, batch, sha, update, repo)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if len(updateOps) > 0 {
|
||||
ops = append(ops, updateOps...)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for _, filename := range changes.RemovedFilenames {
|
||||
ops = append(ops, b.addDelete(filename, repo))
|
||||
}
|
||||
|
||||
if len(ops) > 0 {
|
||||
esBatchSize := 50
|
||||
|
||||
for i := 0; i < len(ops); i += esBatchSize {
|
||||
if err := b.Bulk(ctx, ops[i:min(i+esBatchSize, len(ops))]); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Delete entries by repoId
|
||||
func (b *Indexer) Delete(ctx context.Context, repoID int64) error {
|
||||
if err := b.doDelete(ctx, repoID); err != nil {
|
||||
// Maybe there is a conflict during the delete operation, so we should retry after a refresh
|
||||
log.Warn("Deletion of entries of repo %v within index %v was erroneous: %v. Trying to refresh index before trying again", repoID, b.VersionedIndexName(), err)
|
||||
if err := b.Refresh(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := b.doDelete(ctx, repoID); err != nil {
|
||||
log.Error("Could not delete entries of repo %v within index %v", repoID, b.VersionedIndexName())
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Delete entries by repoId
|
||||
func (b *Indexer) doDelete(ctx context.Context, repoID int64) error {
|
||||
return b.DeleteByQuery(ctx, es.TermsQuery("repo_id", repoID))
|
||||
}
|
||||
|
||||
// contentMatchIndexPos locates the first occurrence of start in content and the
// first occurrence of end after it. It returns the position where start begins
// and the position just past end, reduced by 9.
// If either marker is not found, it returns -1, -1.
func contentMatchIndexPos(content, start, end string) (int, int) {
	startIdx := strings.Index(content, start)
	if startIdx < 0 {
		return -1, -1
	}

	afterStart := startIdx + len(start)
	endOffset := strings.Index(content[afterStart:], end)
	if endOffset < 0 {
		return -1, -1
	}

	// remove the length <em></em> since we give Content the original data
	return startIdx, afterStart + endOffset + len(end) - 9
}
|
||||
|
||||
func convertResult(searchResult *es.SearchResponse, kw string, pageSize int) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
|
||||
hits := make([]*internal.SearchResult, 0, pageSize)
|
||||
for _, hit := range searchResult.Hits {
|
||||
repoID, fileName := internal.ParseIndexerID(hit.ID)
|
||||
res := make(map[string]any)
|
||||
if err := json.Unmarshal(hit.Source, &res); err != nil {
|
||||
return 0, nil, nil, err
|
||||
}
|
||||
|
||||
// FIXME: There is no way to get the position the keyword on the content currently on the same request.
|
||||
// So we get it from content, this may made the query slower. See
|
||||
// https://discuss.elastic.co/t/fetching-position-of-keyword-in-matched-document/94291
|
||||
var startIndex, endIndex int
|
||||
if c, ok := hit.Highlight["filename"]; ok && len(c) > 0 {
|
||||
startIndex, endIndex = internal.FilenameMatchIndexPos(res["content"].(string))
|
||||
} else if c, ok := hit.Highlight["content"]; ok && len(c) > 0 {
|
||||
// FIXME: Since the highlighting content will include <em> and </em> for the keywords,
|
||||
// now we should find the positions. But how to avoid html content which contains the
|
||||
// <em> and </em> tags? If elastic search has handled that?
|
||||
startIndex, endIndex = contentMatchIndexPos(c[0], "<em>", "</em>")
|
||||
if startIndex == -1 {
|
||||
panic(fmt.Sprintf("1===%s,,,%#v,,,%s", kw, hit.Highlight, c[0]))
|
||||
}
|
||||
} else {
|
||||
panic(fmt.Sprintf("2===%#v", hit.Highlight))
|
||||
}
|
||||
|
||||
language := res["language"].(string)
|
||||
|
||||
hits = append(hits, &internal.SearchResult{
|
||||
RepoID: repoID,
|
||||
Filename: fileName,
|
||||
CommitID: res["commit_id"].(string),
|
||||
Content: res["content"].(string),
|
||||
UpdatedUnix: timeutil.TimeStamp(res["updated_at"].(float64)),
|
||||
Language: language,
|
||||
StartIndex: startIndex,
|
||||
EndIndex: endIndex,
|
||||
Color: enry.GetColor(language),
|
||||
})
|
||||
}
|
||||
|
||||
return searchResult.Total, hits, extractAggs(searchResult), nil
|
||||
}
|
||||
|
||||
func extractAggs(searchResult *es.SearchResponse) []*internal.SearchResultLanguages {
|
||||
buckets, found := searchResult.Aggregations["language"]
|
||||
if !found {
|
||||
return nil
|
||||
}
|
||||
searchResultLanguages := make([]*internal.SearchResultLanguages, 0, 10)
|
||||
for _, bucket := range buckets {
|
||||
// language is mapped as keyword so the key is always a string; if the
|
||||
// mapping ever changes, skip rather than emit an empty-language bucket.
|
||||
key, ok := bucket.Key.(string)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
searchResultLanguages = append(searchResultLanguages, &internal.SearchResultLanguages{
|
||||
Language: key,
|
||||
Color: enry.GetColor(key),
|
||||
Count: int(bucket.DocCount),
|
||||
})
|
||||
}
|
||||
return searchResultLanguages
|
||||
}
|
||||
|
||||
// Search searches for codes and language stats by given conditions.
|
||||
func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
|
||||
searchMode := util.IfZero(opts.SearchMode, b.SupportedSearchModes()[0].ModeValue)
|
||||
contentQuery := es.Query(es.NewMultiMatchQuery(opts.Keyword, "content").Type(es.MultiMatchTypeBestFields).Operator("and"))
|
||||
if searchMode == indexer.SearchModeExact {
|
||||
contentQuery = es.MatchPhraseQuery("content", opts.Keyword)
|
||||
}
|
||||
kwQuery := es.NewBoolQuery().Should(
|
||||
contentQuery,
|
||||
es.NewMultiMatchQuery(opts.Keyword, "filename^10").Type(es.MultiMatchTypePhrasePrefix),
|
||||
)
|
||||
query := es.NewBoolQuery().Must(kwQuery)
|
||||
if len(opts.RepoIDs) > 0 {
|
||||
query.Must(es.TermsQuery("repo_id", es.ToAnySlice(opts.RepoIDs)...))
|
||||
}
|
||||
|
||||
start, pageSize := opts.GetSkipTake()
|
||||
kw := "<em>" + opts.Keyword + "</em>"
|
||||
languageAggs := map[string]any{
|
||||
"language": map[string]any{
|
||||
"terms": map[string]any{
|
||||
"field": "language",
|
||||
"size": 10,
|
||||
"order": map[string]any{"_count": "desc"},
|
||||
},
|
||||
},
|
||||
}
|
||||
// number_of_fragments=0 returns the full highlighted content (no fragmentation).
|
||||
highlight := map[string]any{
|
||||
"fields": map[string]any{
|
||||
"content": map[string]any{},
|
||||
"filename": map[string]any{},
|
||||
},
|
||||
"number_of_fragments": 0,
|
||||
"type": "fvh",
|
||||
}
|
||||
sort := []es.SortField{
|
||||
{Field: "_score", Desc: true},
|
||||
{Field: "updated_at", Desc: false},
|
||||
}
|
||||
|
||||
if len(opts.Language) == 0 {
|
||||
resp, err := b.Indexer.Search(ctx, es.SearchRequest{
|
||||
Query: query,
|
||||
Sort: sort,
|
||||
From: start,
|
||||
Size: pageSize,
|
||||
TrackTotal: true,
|
||||
Aggregations: languageAggs,
|
||||
Highlight: highlight,
|
||||
})
|
||||
if err != nil {
|
||||
return 0, nil, nil, err
|
||||
}
|
||||
return convertResult(resp, kw, pageSize)
|
||||
}
|
||||
|
||||
countResp, err := b.Indexer.Search(ctx, es.SearchRequest{
|
||||
Query: query,
|
||||
Size: 0, // stats only
|
||||
TrackTotal: true,
|
||||
Aggregations: languageAggs,
|
||||
})
|
||||
if err != nil {
|
||||
return 0, nil, nil, err
|
||||
}
|
||||
|
||||
query.Must(es.MatchQuery("language", opts.Language))
|
||||
resp, err := b.Indexer.Search(ctx, es.SearchRequest{
|
||||
Query: query,
|
||||
Sort: sort,
|
||||
From: start,
|
||||
Size: pageSize,
|
||||
TrackTotal: true,
|
||||
Highlight: highlight,
|
||||
})
|
||||
if err != nil {
|
||||
return 0, nil, nil, err
|
||||
}
|
||||
|
||||
total, hits, _, err := convertResult(resp, kw, pageSize)
|
||||
return total, hits, extractAggs(countResp), err
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
// Copyright 2020 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package elasticsearch
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestIndexPos(t *testing.T) {
|
||||
startIdx, endIdx := contentMatchIndexPos("test index start and end", "start", "end")
|
||||
assert.Equal(t, 11, startIdx)
|
||||
assert.Equal(t, 15, endIdx)
|
||||
}
|
||||
@@ -0,0 +1,201 @@
|
||||
// Copyright 2019 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package code
|
||||
|
||||
import (
|
||||
"context"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
repo_model "gitea.dev/models/repo"
|
||||
"gitea.dev/modules/git"
|
||||
"gitea.dev/modules/git/gitcmd"
|
||||
"gitea.dev/modules/gitrepo"
|
||||
"gitea.dev/modules/indexer/code/internal"
|
||||
"gitea.dev/modules/log"
|
||||
"gitea.dev/modules/setting"
|
||||
)
|
||||
|
||||
func getDefaultBranchSha(ctx context.Context, repo *repo_model.Repository) (string, error) {
|
||||
stdout, _, err := gitrepo.RunCmdString(ctx, repo, gitcmd.NewCommand("show-ref", "-s").AddDynamicArguments(git.BranchPrefix+repo.DefaultBranch))
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return strings.TrimSpace(stdout), nil
|
||||
}
|
||||
|
||||
// getRepoChanges returns changes to repo since last indexer update
|
||||
func getRepoChanges(ctx context.Context, repo *repo_model.Repository, revision string) (*internal.RepoChanges, error) {
|
||||
status, err := repo_model.GetIndexerStatus(ctx, repo, repo_model.RepoIndexerTypeCode)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
needGenesis := len(status.CommitSha) == 0
|
||||
if !needGenesis {
|
||||
hasAncestorCmd := gitcmd.NewCommand("merge-base").AddDynamicArguments(status.CommitSha, revision)
|
||||
stdout, _, _ := gitrepo.RunCmdString(ctx, repo, hasAncestorCmd) // FIXME: error is not handled
|
||||
needGenesis = len(stdout) == 0
|
||||
}
|
||||
|
||||
if needGenesis {
|
||||
return genesisChanges(ctx, repo, revision)
|
||||
}
|
||||
return nonGenesisChanges(ctx, repo, revision)
|
||||
}
|
||||
|
||||
func isIndexable(entry *git.TreeEntry) bool {
|
||||
if !entry.IsRegular() && !entry.IsExecutable() {
|
||||
return false
|
||||
}
|
||||
name := strings.ToLower(entry.Name())
|
||||
for _, g := range setting.Indexer.ExcludePatterns {
|
||||
if g.Match(name) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
for _, g := range setting.Indexer.IncludePatterns {
|
||||
if g.Match(name) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return len(setting.Indexer.IncludePatterns) == 0
|
||||
}
|
||||
|
||||
// parseGitLsTreeOutput parses the output of a `git ls-tree -r --full-name` command
|
||||
func parseGitLsTreeOutput(stdout []byte) ([]internal.FileUpdate, error) {
|
||||
entries, err := git.ParseTreeEntries(stdout)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
idxCount := 0
|
||||
updates := make([]internal.FileUpdate, len(entries))
|
||||
for _, entry := range entries {
|
||||
if isIndexable(entry) {
|
||||
updates[idxCount] = internal.FileUpdate{
|
||||
Filename: entry.Name(),
|
||||
BlobSha: entry.ID.String(),
|
||||
Size: entry.Size(),
|
||||
Sized: true,
|
||||
}
|
||||
idxCount++
|
||||
}
|
||||
}
|
||||
return updates[:idxCount], nil
|
||||
}
|
||||
|
||||
// genesisChanges get changes to add repo to the indexer for the first time
|
||||
func genesisChanges(ctx context.Context, repo *repo_model.Repository, revision string) (*internal.RepoChanges, error) {
|
||||
var changes internal.RepoChanges
|
||||
stdout, _, runErr := gitrepo.RunCmdBytes(ctx, repo, gitcmd.NewCommand("ls-tree", "--full-tree", "-l", "-r").AddDynamicArguments(revision))
|
||||
if runErr != nil {
|
||||
return nil, runErr
|
||||
}
|
||||
|
||||
var err error
|
||||
changes.Updates, err = parseGitLsTreeOutput(stdout)
|
||||
return &changes, err
|
||||
}
|
||||
|
||||
// nonGenesisChanges get changes since the previous indexer update
|
||||
func nonGenesisChanges(ctx context.Context, repo *repo_model.Repository, revision string) (*internal.RepoChanges, error) {
|
||||
diffCmd := gitcmd.NewCommand("diff", "--name-status").AddDynamicArguments(repo.CodeIndexerStatus.CommitSha, revision)
|
||||
stdout, _, runErr := gitrepo.RunCmdString(ctx, repo, diffCmd)
|
||||
if runErr != nil {
|
||||
// previous commit sha may have been removed by a force push, so
|
||||
// try rebuilding from scratch
|
||||
log.Warn("git diff: %v", runErr)
|
||||
if err := (*globalIndexer.Load()).Delete(ctx, repo.ID); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return genesisChanges(ctx, repo, revision)
|
||||
}
|
||||
|
||||
var changes internal.RepoChanges
|
||||
var err error
|
||||
updatedFilenames := make([]string, 0, 10)
|
||||
|
||||
updateChanges := func() error {
|
||||
cmd := gitcmd.NewCommand("ls-tree", "--full-tree", "-l").AddDynamicArguments(revision).
|
||||
AddDashesAndList(updatedFilenames...)
|
||||
lsTreeStdout, _, err := gitrepo.RunCmdBytes(ctx, repo, cmd)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
updates, err1 := parseGitLsTreeOutput(lsTreeStdout)
|
||||
if err1 != nil {
|
||||
return err1
|
||||
}
|
||||
changes.Updates = append(changes.Updates, updates...)
|
||||
return nil
|
||||
}
|
||||
lines := strings.SplitSeq(stdout, "\n")
|
||||
for line := range lines {
|
||||
line = strings.TrimSpace(line)
|
||||
if len(line) == 0 {
|
||||
continue
|
||||
}
|
||||
fields := strings.Split(line, "\t")
|
||||
if len(fields) < 2 {
|
||||
log.Warn("Unparseable output for diff --name-status: `%s`)", line)
|
||||
continue
|
||||
}
|
||||
filename := fields[1]
|
||||
if len(filename) == 0 {
|
||||
continue
|
||||
} else if filename[0] == '"' {
|
||||
filename, err = strconv.Unquote(filename)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
switch status := fields[0][0]; status {
|
||||
case 'M', 'A':
|
||||
updatedFilenames = append(updatedFilenames, filename)
|
||||
case 'D':
|
||||
changes.RemovedFilenames = append(changes.RemovedFilenames, filename)
|
||||
case 'R', 'C':
|
||||
if len(fields) < 3 {
|
||||
log.Warn("Unparseable output for diff --name-status: `%s`)", line)
|
||||
continue
|
||||
}
|
||||
dest := fields[2]
|
||||
if len(dest) == 0 {
|
||||
log.Warn("Unparseable output for diff --name-status: `%s`)", line)
|
||||
continue
|
||||
}
|
||||
if dest[0] == '"' {
|
||||
dest, err = strconv.Unquote(dest)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
if status == 'R' {
|
||||
changes.RemovedFilenames = append(changes.RemovedFilenames, filename)
|
||||
}
|
||||
updatedFilenames = append(updatedFilenames, dest)
|
||||
default:
|
||||
log.Warn("Unrecognized status: %c (line=%s)", status, line)
|
||||
}
|
||||
|
||||
// According to https://learn.microsoft.com/en-us/troubleshoot/windows-client/shell-experience/command-line-string-limitation#more-information
|
||||
// the command line length should less than 8191 characters, assume filepath is 256, then 8191/256 = 31, so we use 30
|
||||
if len(updatedFilenames) >= 30 {
|
||||
if err := updateChanges(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
updatedFilenames = updatedFilenames[0:0]
|
||||
}
|
||||
}
|
||||
|
||||
if len(updatedFilenames) > 0 {
|
||||
if err := updateChanges(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
return &changes, err
|
||||
}
|
||||
@@ -0,0 +1,66 @@
|
||||
// Copyright 2025 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package gitgrep
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"gitea.dev/modules/git"
|
||||
"gitea.dev/modules/indexer"
|
||||
code_indexer "gitea.dev/modules/indexer/code"
|
||||
"gitea.dev/modules/setting"
|
||||
)
|
||||
|
||||
func indexSettingToGitGrepPathspecList() (list []string) {
|
||||
for _, expr := range setting.Indexer.IncludePatterns {
|
||||
list = append(list, ":(glob)"+expr.PatternString())
|
||||
}
|
||||
for _, expr := range setting.Indexer.ExcludePatterns {
|
||||
list = append(list, ":(glob,exclude)"+expr.PatternString())
|
||||
}
|
||||
return list
|
||||
}
|
||||
|
||||
func PerformSearch(ctx context.Context, page int, repoID int64, gitRepo *git.Repository, ref git.RefName, keyword string, searchMode indexer.SearchModeType) (searchResults []*code_indexer.Result, total int64, err error) {
|
||||
grepMode := git.GrepModeWords
|
||||
switch searchMode {
|
||||
case indexer.SearchModeExact:
|
||||
grepMode = git.GrepModeExact
|
||||
case indexer.SearchModeRegexp:
|
||||
grepMode = git.GrepModeRegexp
|
||||
}
|
||||
res, err := git.GrepSearch(ctx, gitRepo, keyword, git.GrepOptions{
|
||||
ContextLineNumber: 1,
|
||||
GrepMode: grepMode,
|
||||
RefName: ref.String(),
|
||||
PathspecList: indexSettingToGitGrepPathspecList(),
|
||||
})
|
||||
if err != nil {
|
||||
// TODO: if no branch exists, it reports: exit status 128, fatal: this operation must be run in a work tree.
|
||||
return nil, 0, fmt.Errorf("git.GrepSearch: %w", err)
|
||||
}
|
||||
commitID, err := gitRepo.GetRefCommitID(ref.String())
|
||||
if err != nil {
|
||||
return nil, 0, fmt.Errorf("gitRepo.GetRefCommitID: %w", err)
|
||||
}
|
||||
|
||||
total = int64(len(res))
|
||||
pageStart := min((page-1)*setting.UI.RepoSearchPagingNum, len(res))
|
||||
pageEnd := min(page*setting.UI.RepoSearchPagingNum, len(res))
|
||||
res = res[pageStart:pageEnd]
|
||||
for _, r := range res {
|
||||
searchResults = append(searchResults, &code_indexer.Result{
|
||||
RepoID: repoID,
|
||||
Filename: r.Filename,
|
||||
CommitID: commitID,
|
||||
// UpdatedUnix: not supported yet
|
||||
// Language: not supported yet
|
||||
// Color: not supported yet
|
||||
Lines: code_indexer.HighlightSearchResultCode(r.Filename, "", r.LineNumbers, strings.Join(r.LineCodes, "\n")),
|
||||
})
|
||||
}
|
||||
return searchResults, total, nil
|
||||
}
|
||||
@@ -0,0 +1,19 @@
|
||||
// Copyright 2024 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package gitgrep
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"gitea.dev/modules/setting"
|
||||
"gitea.dev/modules/test"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestIndexSettingToGitGrepPathspecList(t *testing.T) {
|
||||
defer test.MockVariableValue(&setting.Indexer.IncludePatterns, setting.IndexerGlobFromString("a"))()
|
||||
defer test.MockVariableValue(&setting.Indexer.ExcludePatterns, setting.IndexerGlobFromString("b"))()
|
||||
assert.Equal(t, []string{":(glob)a", ":(glob,exclude)b"}, indexSettingToGitGrepPathspecList())
|
||||
}
|
||||
@@ -0,0 +1,314 @@
|
||||
// Copyright 2016 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package code
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"runtime/pprof"
|
||||
"slices"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"gitea.dev/models/db"
|
||||
repo_model "gitea.dev/models/repo"
|
||||
"gitea.dev/modules/graceful"
|
||||
"gitea.dev/modules/indexer"
|
||||
"gitea.dev/modules/indexer/code/bleve"
|
||||
"gitea.dev/modules/indexer/code/elasticsearch"
|
||||
"gitea.dev/modules/indexer/code/internal"
|
||||
"gitea.dev/modules/log"
|
||||
"gitea.dev/modules/process"
|
||||
"gitea.dev/modules/queue"
|
||||
"gitea.dev/modules/setting"
|
||||
"gitea.dev/modules/util"
|
||||
)
|
||||
|
||||
var (
|
||||
indexerQueue *queue.WorkerPoolQueue[*internal.IndexerData]
|
||||
// globalIndexer is the global indexer, it cannot be nil.
|
||||
// When the real indexer is not ready, it will be a dummy indexer which will return error to explain it's not ready.
|
||||
// So it's always safe use it as *globalIndexer.Load() and call its methods.
|
||||
globalIndexer atomic.Pointer[internal.Indexer]
|
||||
)
|
||||
|
||||
func init() {
|
||||
dummyIndexer := internal.NewDummyIndexer()
|
||||
globalIndexer.Store(&dummyIndexer)
|
||||
}
|
||||
|
||||
func index(ctx context.Context, indexer internal.Indexer, repoID int64) error {
|
||||
repo, err := repo_model.GetRepositoryByID(ctx, repoID)
|
||||
if repo_model.IsErrRepoNotExist(err) {
|
||||
return indexer.Delete(ctx, repoID)
|
||||
}
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
repoTypes := setting.Indexer.RepoIndexerRepoTypes
|
||||
|
||||
if len(repoTypes) == 0 {
|
||||
repoTypes = []string{"sources"}
|
||||
}
|
||||
|
||||
// skip forks from being indexed if unit is not present
|
||||
if !slices.Contains(repoTypes, "forks") && repo.IsFork {
|
||||
return nil
|
||||
}
|
||||
|
||||
// skip mirrors from being indexed if unit is not present
|
||||
if !slices.Contains(repoTypes, "mirrors") && repo.IsMirror {
|
||||
return nil
|
||||
}
|
||||
|
||||
// skip templates from being indexed if unit is not present
|
||||
if !slices.Contains(repoTypes, "templates") && repo.IsTemplate {
|
||||
return nil
|
||||
}
|
||||
|
||||
// skip regular repos from being indexed if unit is not present
|
||||
if !slices.Contains(repoTypes, "sources") && !repo.IsFork && !repo.IsMirror && !repo.IsTemplate {
|
||||
return nil
|
||||
}
|
||||
|
||||
sha, err := getDefaultBranchSha(ctx, repo)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
changes, err := getRepoChanges(ctx, repo, sha)
|
||||
if err != nil {
|
||||
return err
|
||||
} else if changes == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
if err := indexer.Index(ctx, repo, sha, changes); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return repo_model.UpdateIndexerStatus(ctx, repo, repo_model.RepoIndexerTypeCode, sha)
|
||||
}
|
||||
|
||||
// Init initialize the repo indexer
|
||||
func Init() {
|
||||
if !setting.Indexer.RepoIndexerEnabled {
|
||||
(*globalIndexer.Load()).Close()
|
||||
return
|
||||
}
|
||||
|
||||
ctx, cancel, finished := process.GetManager().AddTypedContext(context.Background(), "Service: CodeIndexer", process.SystemProcessType, false)
|
||||
|
||||
graceful.GetManager().RunAtTerminate(func() {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
default:
|
||||
}
|
||||
cancel()
|
||||
log.Debug("Closing repository indexer")
|
||||
(*globalIndexer.Load()).Close()
|
||||
log.Info("PID: %d Repository Indexer closed", os.Getpid())
|
||||
finished()
|
||||
})
|
||||
|
||||
waitChannel := make(chan time.Duration, 1)
|
||||
|
||||
// Create the Queue
|
||||
switch setting.Indexer.RepoType {
|
||||
case "bleve", "elasticsearch":
|
||||
handler := func(items ...*internal.IndexerData) (unhandled []*internal.IndexerData) {
|
||||
indexer := *globalIndexer.Load()
|
||||
for _, indexerData := range items {
|
||||
log.Trace("IndexerData Process Repo: %d", indexerData.RepoID)
|
||||
if err := index(ctx, indexer, indexerData.RepoID); err != nil {
|
||||
if !setting.IsInTesting {
|
||||
log.Error("Codes indexer handler: index error for repo %v: %v", indexerData.RepoID, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil // do not re-queue the failed items, otherwise some broken repo will block the queue
|
||||
}
|
||||
|
||||
indexerQueue = queue.CreateUniqueQueue(ctx, "code_indexer", handler)
|
||||
if indexerQueue == nil {
|
||||
log.Fatal("Unable to create codes indexer queue")
|
||||
}
|
||||
default:
|
||||
log.Fatal("Unknown codes indexer type; %s", setting.Indexer.RepoType)
|
||||
}
|
||||
|
||||
go func() {
|
||||
pprof.SetGoroutineLabels(ctx)
|
||||
start := time.Now()
|
||||
var (
|
||||
rIndexer internal.Indexer
|
||||
existed bool
|
||||
err error
|
||||
)
|
||||
switch setting.Indexer.RepoType {
|
||||
case "bleve":
|
||||
log.Info("PID: %d Initializing Repository Indexer at: %s", os.Getpid(), setting.Indexer.RepoPath)
|
||||
defer func() {
|
||||
if err := recover(); err != nil {
|
||||
log.Error("PANIC whilst initializing repository indexer: %v\nStacktrace: %s", err, log.Stack(2))
|
||||
log.Error("The indexer files are likely corrupted and may need to be deleted")
|
||||
log.Error("You can completely remove the \"%s\" directory to make Gitea recreate the indexes", setting.Indexer.RepoPath)
|
||||
}
|
||||
}()
|
||||
|
||||
rIndexer = bleve.NewIndexer(setting.Indexer.RepoPath)
|
||||
existed, err = rIndexer.Init(ctx)
|
||||
if err != nil {
|
||||
cancel()
|
||||
(*globalIndexer.Load()).Close()
|
||||
close(waitChannel)
|
||||
log.Fatal("PID: %d Unable to initialize the bleve Repository Indexer at path: %s Error: %v", os.Getpid(), setting.Indexer.RepoPath, err)
|
||||
}
|
||||
case "elasticsearch":
|
||||
log.Info("PID: %d Initializing Repository Indexer at: %s", os.Getpid(), util.SanitizeCredentialURLs(setting.Indexer.RepoConnStr))
|
||||
defer func() {
|
||||
if err := recover(); err != nil {
|
||||
log.Error("PANIC whilst initializing repository indexer: %v\nStacktrace: %s", err, log.Stack(2))
|
||||
log.Error("The indexer files are likely corrupted and may need to be deleted")
|
||||
log.Error("You can completely remove the \"%s\" index to make Gitea recreate the indexes", util.SanitizeCredentialURLs(setting.Indexer.RepoConnStr))
|
||||
}
|
||||
}()
|
||||
|
||||
rIndexer = elasticsearch.NewIndexer(setting.Indexer.RepoConnStr, setting.Indexer.RepoIndexerName)
|
||||
existed, err = rIndexer.Init(ctx)
|
||||
if err != nil {
|
||||
cancel()
|
||||
(*globalIndexer.Load()).Close()
|
||||
close(waitChannel)
|
||||
log.Fatal("PID: %d Unable to initialize the elasticsearch Repository Indexer connstr: %s Error: %v", os.Getpid(), util.SanitizeCredentialURLs(setting.Indexer.RepoConnStr), err)
|
||||
}
|
||||
|
||||
default:
|
||||
log.Fatal("PID: %d Unknown Indexer type: %s", os.Getpid(), setting.Indexer.RepoType)
|
||||
}
|
||||
|
||||
globalIndexer.Store(&rIndexer)
|
||||
|
||||
// Start processing the queue
|
||||
go graceful.GetManager().RunWithCancel(indexerQueue)
|
||||
|
||||
if !existed { // populate the index because it's created for the first time
|
||||
go graceful.GetManager().RunWithShutdownContext(populateRepoIndexer)
|
||||
}
|
||||
select {
|
||||
case waitChannel <- time.Since(start):
|
||||
case <-graceful.GetManager().IsShutdown():
|
||||
}
|
||||
|
||||
close(waitChannel)
|
||||
}()
|
||||
|
||||
if setting.Indexer.StartupTimeout > 0 {
|
||||
go func() {
|
||||
pprof.SetGoroutineLabels(ctx)
|
||||
timeout := setting.Indexer.StartupTimeout
|
||||
if graceful.GetManager().IsChild() && setting.GracefulHammerTime > 0 {
|
||||
timeout += setting.GracefulHammerTime
|
||||
}
|
||||
select {
|
||||
case <-graceful.GetManager().IsShutdown():
|
||||
log.Warn("Shutdown before Repository Indexer completed initialization")
|
||||
cancel()
|
||||
(*globalIndexer.Load()).Close()
|
||||
case duration, ok := <-waitChannel:
|
||||
if !ok {
|
||||
log.Warn("Repository Indexer Initialization failed")
|
||||
cancel()
|
||||
(*globalIndexer.Load()).Close()
|
||||
return
|
||||
}
|
||||
log.Info("Repository Indexer Initialization took %v", duration)
|
||||
case <-time.After(timeout):
|
||||
cancel()
|
||||
(*globalIndexer.Load()).Close()
|
||||
log.Fatal("Repository Indexer Initialization Timed-Out after: %v", timeout)
|
||||
}
|
||||
}()
|
||||
}
|
||||
}
|
||||
|
||||
// UpdateRepoIndexer update a repository's entries in the indexer
|
||||
func UpdateRepoIndexer(repo *repo_model.Repository) {
|
||||
indexData := &internal.IndexerData{RepoID: repo.ID}
|
||||
if err := indexerQueue.Push(indexData); err != nil {
|
||||
log.Error("Update repo index data %v failed: %v", indexData, err)
|
||||
}
|
||||
}
|
||||
|
||||
// IsAvailable checks if issue indexer is available
|
||||
func IsAvailable(ctx context.Context) bool {
|
||||
return (*globalIndexer.Load()).Ping(ctx) == nil
|
||||
}
|
||||
|
||||
// populateRepoIndexer populate the repo indexer with pre-existing data. This
|
||||
// should only be run when the indexer is created for the first time.
|
||||
func populateRepoIndexer(ctx context.Context) {
|
||||
log.Info("Populating the repo indexer with existing repositories")
|
||||
|
||||
exist, err := db.IsTableNotEmpty("repository")
|
||||
if err != nil {
|
||||
log.Fatal("System error: %v", err)
|
||||
} else if !exist {
|
||||
return
|
||||
}
|
||||
|
||||
// if there is any existing repo indexer metadata in the DB, delete it
|
||||
// since we are starting afresh. Also, xorm requires deletes to have a
|
||||
// condition, and we want to delete everything, thus 1=1.
|
||||
if err := db.DeleteAllRecords("repo_indexer_status"); err != nil {
|
||||
log.Fatal("System error: %v", err)
|
||||
}
|
||||
|
||||
var maxRepoID int64
|
||||
if maxRepoID, err = db.GetMaxID("repository"); err != nil {
|
||||
log.Fatal("System error: %v", err)
|
||||
}
|
||||
|
||||
// start with the maximum existing repo ID and work backwards, so that we
|
||||
// don't include repos that are created after gitea starts; such repos will
|
||||
// already be added to the indexer, and we don't need to add them again.
|
||||
for maxRepoID > 0 {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
log.Info("Repository Indexer population shutdown before completion")
|
||||
return
|
||||
default:
|
||||
}
|
||||
ids, err := repo_model.GetUnindexedRepos(ctx, repo_model.RepoIndexerTypeCode, maxRepoID, 0, 50)
|
||||
if err != nil {
|
||||
log.Error("populateRepoIndexer: %v", err)
|
||||
return
|
||||
} else if len(ids) == 0 {
|
||||
break
|
||||
}
|
||||
for _, id := range ids {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
log.Info("Repository Indexer population shutdown before completion")
|
||||
return
|
||||
default:
|
||||
}
|
||||
if err := indexerQueue.Push(&internal.IndexerData{RepoID: id}); err != nil {
|
||||
log.Error("indexerQueue.Push: %v", err)
|
||||
return
|
||||
}
|
||||
maxRepoID = id - 1
|
||||
}
|
||||
}
|
||||
log.Info("Done (re)populating the repo indexer with existing repositories")
|
||||
}
|
||||
|
||||
func SupportedSearchModes() []indexer.SearchMode {
|
||||
gi := globalIndexer.Load()
|
||||
if gi == nil {
|
||||
return nil
|
||||
}
|
||||
return (*gi).SupportedSearchModes()
|
||||
}
|
||||
@@ -0,0 +1,352 @@
|
||||
// Copyright 2020 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package code
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"slices"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"gitea.dev/models/db"
|
||||
"gitea.dev/models/unittest"
|
||||
indexer_module "gitea.dev/modules/indexer"
|
||||
"gitea.dev/modules/indexer/code/bleve"
|
||||
"gitea.dev/modules/indexer/code/elasticsearch"
|
||||
"gitea.dev/modules/indexer/code/internal"
|
||||
"gitea.dev/modules/setting"
|
||||
"gitea.dev/modules/test"
|
||||
"gitea.dev/modules/util"
|
||||
|
||||
_ "gitea.dev/models"
|
||||
_ "gitea.dev/models/actions"
|
||||
_ "gitea.dev/models/activities"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
// codeSearchResult is the (filename, content) pair of a search hit that the
// indexer tests compare against their expectations.
type codeSearchResult struct {
	Filename, Content string
}
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
unittest.MainTest(m)
|
||||
}
|
||||
|
||||
func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
|
||||
t.Run(name, func(t *testing.T) {
|
||||
assert.NoError(t, setupRepositoryIndexes(t.Context(), indexer))
|
||||
// Wait for the index to catch up: ES/OpenSearch make writes visible
|
||||
// only after a refresh (default interval: 1s). Bleve is synchronous
|
||||
// and passes on the first iteration.
|
||||
require.Eventually(t, func() bool {
|
||||
total, _, _, err := indexer.Search(t.Context(), &internal.SearchOptions{
|
||||
Keyword: "Description",
|
||||
Paginator: &db.ListOptions{Page: 1, PageSize: 1},
|
||||
})
|
||||
return err == nil && total > 0
|
||||
}, 10*time.Second, 100*time.Millisecond, "index did not become searchable")
|
||||
|
||||
keywords := []struct {
|
||||
RepoIDs []int64
|
||||
Keyword string
|
||||
Langs int
|
||||
SearchMode indexer_module.SearchModeType
|
||||
Results []codeSearchResult
|
||||
}{
|
||||
// Search for an exact match on the contents of a file
|
||||
// This scenario yields a single result (the file README.md on the repo '1')
|
||||
{
|
||||
RepoIDs: nil,
|
||||
Keyword: "Description",
|
||||
Langs: 1,
|
||||
Results: []codeSearchResult{
|
||||
{
|
||||
Filename: "README.md",
|
||||
Content: "# repo1\n\nDescription for repo1",
|
||||
},
|
||||
},
|
||||
},
|
||||
// Search for an exact match on the contents of a file within the repo '2'.
|
||||
// This scenario yields no results
|
||||
{
|
||||
RepoIDs: []int64{2},
|
||||
Keyword: "Description",
|
||||
Langs: 0,
|
||||
},
|
||||
// Search for an exact match on the contents of a file
|
||||
// This scenario yields a single result (the file README.md on the repo '1')
|
||||
{
|
||||
RepoIDs: nil,
|
||||
Keyword: "repo1",
|
||||
Langs: 1,
|
||||
Results: []codeSearchResult{
|
||||
{
|
||||
Filename: "README.md",
|
||||
Content: "# repo1\n\nDescription for repo1",
|
||||
},
|
||||
},
|
||||
},
|
||||
// Search for an exact match on the contents of a file within the repo '2'.
|
||||
// This scenario yields no results
|
||||
{
|
||||
RepoIDs: []int64{2},
|
||||
Keyword: "repo1",
|
||||
Langs: 0,
|
||||
},
|
||||
// Search for a non-existing term.
|
||||
// This scenario yields no results
|
||||
{
|
||||
RepoIDs: nil,
|
||||
Keyword: "non-exist",
|
||||
Langs: 0,
|
||||
},
|
||||
// Search for an exact match on the contents of a file within the repo '62'.
|
||||
// This scenario yields a single result (the file avocado.md on the repo '62')
|
||||
{
|
||||
RepoIDs: []int64{62},
|
||||
Keyword: "pineaple",
|
||||
Langs: 1,
|
||||
Results: []codeSearchResult{
|
||||
{
|
||||
Filename: "avocado.md",
|
||||
Content: "# repo1\n\npineaple pie of cucumber juice",
|
||||
},
|
||||
},
|
||||
},
|
||||
// Search for an exact match on the filename within the repo '62'.
|
||||
// This scenario yields a single result (the file avocado.md on the repo '62')
|
||||
{
|
||||
RepoIDs: []int64{62},
|
||||
Keyword: "avocado.md",
|
||||
Langs: 1,
|
||||
Results: []codeSearchResult{
|
||||
{
|
||||
Filename: "avocado.md",
|
||||
Content: "# repo1\n\npineaple pie of cucumber juice",
|
||||
},
|
||||
},
|
||||
},
|
||||
// Search for an partial match on the filename within the repo '62'.
|
||||
// This scenario yields a single result (the file avocado.md on the repo '62')
|
||||
{
|
||||
RepoIDs: []int64{62},
|
||||
Keyword: "avo",
|
||||
Langs: 1,
|
||||
Results: []codeSearchResult{
|
||||
{
|
||||
Filename: "avocado.md",
|
||||
Content: "# repo1\n\npineaple pie of cucumber juice",
|
||||
},
|
||||
},
|
||||
},
|
||||
// Search for matches on both the contents and the filenames within the repo '62'.
|
||||
// This scenario yields two results: the first result is based on the file (cucumber.md) while the second is based on the contents
|
||||
{
|
||||
RepoIDs: []int64{62},
|
||||
Keyword: "cucumber",
|
||||
Langs: 1,
|
||||
Results: []codeSearchResult{
|
||||
{
|
||||
Filename: "cucumber.md",
|
||||
Content: "Salad is good for your health",
|
||||
},
|
||||
{
|
||||
Filename: "avocado.md",
|
||||
Content: "# repo1\n\npineaple pie of cucumber juice",
|
||||
},
|
||||
},
|
||||
},
|
||||
// Search for matches on the filenames within the repo '62'.
|
||||
// This scenario yields two results (both are based on filename, the first one is an exact match)
|
||||
{
|
||||
RepoIDs: []int64{62},
|
||||
Keyword: "ham",
|
||||
Langs: 1,
|
||||
Results: []codeSearchResult{
|
||||
{
|
||||
Filename: "ham.md",
|
||||
Content: "This is also not cheese",
|
||||
},
|
||||
{
|
||||
Filename: "potato/ham.md",
|
||||
Content: "This is not cheese",
|
||||
},
|
||||
},
|
||||
},
|
||||
// Search for matches on the contents of files within the repo '62'.
|
||||
// This scenario yields two results (both are based on contents, the first one is an exact match where as the second is a 'fuzzy' one)
|
||||
{
|
||||
RepoIDs: []int64{62},
|
||||
Keyword: "This is not cheese",
|
||||
Langs: 1,
|
||||
Results: []codeSearchResult{
|
||||
{
|
||||
Filename: "potato/ham.md",
|
||||
Content: "This is not cheese",
|
||||
},
|
||||
{
|
||||
Filename: "ham.md",
|
||||
Content: "This is also not cheese",
|
||||
},
|
||||
},
|
||||
},
|
||||
// Search for matches on the contents of files regardless of case.
|
||||
{
|
||||
RepoIDs: nil,
|
||||
Keyword: "dESCRIPTION",
|
||||
Langs: 1,
|
||||
SearchMode: indexer_module.SearchModeFuzzy,
|
||||
Results: []codeSearchResult{
|
||||
{
|
||||
Filename: "README.md",
|
||||
Content: "# repo1\n\nDescription for repo1",
|
||||
},
|
||||
},
|
||||
},
|
||||
// Search for an exact match on the filename within the repo '62' (case-insensitive).
|
||||
// This scenario yields a single result (the file avocado.md on the repo '62')
|
||||
{
|
||||
RepoIDs: []int64{62},
|
||||
Keyword: "AVOCADO.MD",
|
||||
Langs: 1,
|
||||
Results: []codeSearchResult{
|
||||
{
|
||||
Filename: "avocado.md",
|
||||
Content: "# repo1\n\npineaple pie of cucumber juice",
|
||||
},
|
||||
},
|
||||
},
|
||||
// Search for matches on the contents of files when the criteria are an expression.
|
||||
{
|
||||
RepoIDs: []int64{62},
|
||||
Keyword: "console.log",
|
||||
Langs: 1,
|
||||
Results: []codeSearchResult{
|
||||
{
|
||||
Filename: "example-file.js",
|
||||
Content: "console.log(\"Hello, World!\")",
|
||||
},
|
||||
},
|
||||
},
|
||||
// Search for matches on the contents of files when the criteria are parts of an expression.
|
||||
{
|
||||
RepoIDs: []int64{62},
|
||||
Keyword: "log",
|
||||
Langs: 1,
|
||||
Results: []codeSearchResult{
|
||||
{
|
||||
Filename: "example-file.js",
|
||||
Content: "console.log(\"Hello, World!\")",
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, kw := range keywords {
|
||||
t.Run(kw.Keyword, func(t *testing.T) {
|
||||
total, res, langs, err := indexer.Search(t.Context(), &internal.SearchOptions{
|
||||
RepoIDs: kw.RepoIDs,
|
||||
Keyword: kw.Keyword,
|
||||
SearchMode: util.IfZero(kw.SearchMode, indexer_module.SearchModeWords),
|
||||
Paginator: &db.ListOptions{
|
||||
Page: 1,
|
||||
PageSize: 10,
|
||||
},
|
||||
})
|
||||
require.NoError(t, err)
|
||||
require.Len(t, langs, kw.Langs)
|
||||
|
||||
hits := make([]codeSearchResult, 0, len(res))
|
||||
|
||||
if total > 0 {
|
||||
assert.NotEmpty(t, kw.Results, "The given scenario does not provide any expected results")
|
||||
}
|
||||
|
||||
for _, hit := range res {
|
||||
hits = append(hits, codeSearchResult{
|
||||
Filename: hit.Filename,
|
||||
Content: hit.Content,
|
||||
})
|
||||
}
|
||||
|
||||
lastIndex := -1
|
||||
|
||||
for _, expected := range kw.Results {
|
||||
index := slices.Index(hits, expected)
|
||||
if index == -1 {
|
||||
assert.Failf(t, "Result not found", "Expected %v in %v", expected, hits)
|
||||
} else if lastIndex > index {
|
||||
assert.Failf(t, "Result is out of order", "The order of %v within %v is wrong", expected, hits)
|
||||
} else {
|
||||
lastIndex = index
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
assert.NoError(t, tearDownRepositoryIndexes(t.Context(), indexer))
|
||||
})
|
||||
}
|
||||
|
||||
func TestBleveIndexAndSearch(t *testing.T) {
|
||||
unittest.PrepareTestEnv(t)
|
||||
defer test.MockVariableValue(&setting.Indexer.TypeBleveMaxFuzzniess, 2)()
|
||||
dir := t.TempDir()
|
||||
|
||||
idx := bleve.NewIndexer(dir)
|
||||
defer idx.Close()
|
||||
|
||||
_, err := idx.Init(t.Context())
|
||||
require.NoError(t, err)
|
||||
|
||||
testIndexer("bleve", t, idx)
|
||||
}
|
||||
|
||||
func TestESIndexAndSearch(t *testing.T) {
|
||||
unittest.PrepareTestEnv(t)
|
||||
|
||||
u := os.Getenv("TEST_INDEXER_CODE_ES_URL")
|
||||
if u == "" {
|
||||
t.SkipNow()
|
||||
return
|
||||
}
|
||||
|
||||
indexer := elasticsearch.NewIndexer(u, "gitea_codes")
|
||||
if _, err := indexer.Init(t.Context()); err != nil {
|
||||
if indexer != nil {
|
||||
indexer.Close()
|
||||
}
|
||||
require.NoError(t, err, "Unable to init ES indexer")
|
||||
}
|
||||
|
||||
defer indexer.Close()
|
||||
|
||||
testIndexer("elastic_search", t, indexer)
|
||||
}
|
||||
|
||||
func setupRepositoryIndexes(ctx context.Context, indexer internal.Indexer) error {
|
||||
for _, repoID := range repositoriesToSearch() {
|
||||
if err := index(ctx, indexer, repoID); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func tearDownRepositoryIndexes(ctx context.Context, indexer internal.Indexer) error {
|
||||
for _, repoID := range repositoriesToSearch() {
|
||||
if err := indexer.Delete(ctx, repoID); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// repositoriesToSearch returns the fixed list of repository IDs (1 and 62)
// that the indexer tests index and later delete.
func repositoriesToSearch() []int64 {
	repoIDs := []int64{1, 62}
	return repoIDs
}
|
||||
@@ -0,0 +1,60 @@
|
||||
// Copyright 2023 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package internal
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
|
||||
"gitea.dev/models/db"
|
||||
repo_model "gitea.dev/models/repo"
|
||||
"gitea.dev/modules/indexer"
|
||||
"gitea.dev/modules/indexer/internal"
|
||||
)
|
||||
|
||||
// Indexer defines an interface to index and search code contents
|
||||
type Indexer interface {
|
||||
internal.Indexer
|
||||
Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *RepoChanges) error
|
||||
Delete(ctx context.Context, repoID int64) error
|
||||
Search(ctx context.Context, opts *SearchOptions) (int64, []*SearchResult, []*SearchResultLanguages, error)
|
||||
SupportedSearchModes() []indexer.SearchMode
|
||||
}
|
||||
|
||||
type SearchOptions struct {
|
||||
RepoIDs []int64
|
||||
Keyword string
|
||||
Language string
|
||||
|
||||
SearchMode indexer.SearchModeType
|
||||
|
||||
db.Paginator
|
||||
}
|
||||
|
||||
// NewDummyIndexer returns a dummy indexer
|
||||
func NewDummyIndexer() Indexer {
|
||||
return &dummyIndexer{
|
||||
Indexer: internal.NewDummyIndexer(),
|
||||
}
|
||||
}
|
||||
|
||||
type dummyIndexer struct {
|
||||
internal.Indexer
|
||||
}
|
||||
|
||||
func (d *dummyIndexer) SupportedSearchModes() []indexer.SearchMode {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (d *dummyIndexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *RepoChanges) error {
|
||||
return errors.New("indexer is not ready")
|
||||
}
|
||||
|
||||
func (d *dummyIndexer) Delete(ctx context.Context, repoID int64) error {
|
||||
return errors.New("indexer is not ready")
|
||||
}
|
||||
|
||||
func (d *dummyIndexer) Search(ctx context.Context, opts *SearchOptions) (int64, []*SearchResult, []*SearchResultLanguages, error) {
|
||||
return 0, nil, nil, errors.New("indexer is not ready")
|
||||
}
|
||||
@@ -0,0 +1,44 @@
|
||||
// Copyright 2023 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package internal
|
||||
|
||||
import "gitea.dev/modules/timeutil"
|
||||
|
||||
// FileUpdate describes one file entry listed in RepoChanges.Updates.
type FileUpdate struct {
	Filename, BlobSha string
	Size              int64
	// NOTE(review): presumably reports whether Size has been populated — confirm against the code that fills it.
	Sized bool
}
|
||||
|
||||
// RepoChanges changes (file additions/updates/removals) to a repo
|
||||
type RepoChanges struct {
|
||||
Updates []FileUpdate
|
||||
RemovedFilenames []string
|
||||
}
|
||||
|
||||
// IndexerData is the payload stored in the code indexer; it carries only the
// ID of the repository concerned.
type IndexerData struct {
	RepoID int64
}
|
||||
|
||||
// SearchResult result of performing a search in a repo
|
||||
type SearchResult struct {
|
||||
RepoID int64
|
||||
StartIndex int
|
||||
EndIndex int
|
||||
Filename string
|
||||
Content string
|
||||
CommitID string
|
||||
UpdatedUnix timeutil.TimeStamp
|
||||
Language string
|
||||
Color string
|
||||
}
|
||||
|
||||
// SearchResultLanguages is one entry of the top-languages count computed over
// search results: a language, its colour and how many times it occurred.
type SearchResultLanguages struct {
	Language, Color string
	Count           int
}
|
||||
@@ -0,0 +1,48 @@
|
||||
// Copyright 2023 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package internal
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"gitea.dev/modules/indexer/internal"
|
||||
"gitea.dev/modules/log"
|
||||
)
|
||||
|
||||
// filenameMatchNumberOfLines is the line-count threshold used by
// FilenameMatchIndexPos. The value was copied from GitHub search.
const filenameMatchNumberOfLines = 7
|
||||
|
||||
func FilenameIndexerID(repoID int64, filename string) string {
|
||||
return internal.Base36(repoID) + "_" + filename
|
||||
}
|
||||
|
||||
func ParseIndexerID(indexerID string) (int64, string) {
|
||||
before, after, ok := strings.Cut(indexerID, "_")
|
||||
if !ok {
|
||||
log.Error("Unexpected ID in repo indexer: %s", indexerID)
|
||||
}
|
||||
repoID, _ := internal.ParseBase36(before)
|
||||
return repoID, after
|
||||
}
|
||||
|
||||
func FilenameOfIndexerID(indexerID string) string {
|
||||
_, after, ok := strings.Cut(indexerID, "_")
|
||||
if !ok {
|
||||
log.Error("Unexpected ID in repo indexer: %s", indexerID)
|
||||
}
|
||||
return after
|
||||
}
|
||||
|
||||
// FilenameMatchIndexPos returns the boundaries of its first seven lines.
|
||||
func FilenameMatchIndexPos(content string) (int, int) {
|
||||
count := 1
|
||||
for i, c := range content {
|
||||
if c == '\n' {
|
||||
count++
|
||||
if count == filenameMatchNumberOfLines {
|
||||
return 0, i
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0, len(content)
|
||||
}
|
||||
@@ -0,0 +1,153 @@
|
||||
// Copyright 2017 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package code
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"html/template"
|
||||
"strings"
|
||||
|
||||
"gitea.dev/modules/highlight"
|
||||
"gitea.dev/modules/indexer/code/internal"
|
||||
"gitea.dev/modules/timeutil"
|
||||
)
|
||||
|
||||
// Result a search result to display
|
||||
type Result struct {
|
||||
RepoID int64
|
||||
Filename string
|
||||
CommitID string
|
||||
UpdatedUnix timeutil.TimeStamp
|
||||
Language string
|
||||
Color string
|
||||
Lines []*ResultLine
|
||||
}
|
||||
|
||||
// ResultLine is one displayed line of a search result.
type ResultLine struct {
	// Num is the line number attached to this line.
	Num int
	// FormattedContent is the HTML rendering of the line.
	FormattedContent template.HTML
}
|
||||
|
||||
type SearchResultLanguages = internal.SearchResultLanguages
|
||||
|
||||
type SearchOptions = internal.SearchOptions
|
||||
|
||||
// indices widens the byte range [selectionStartIndex, selectionEndIndex) of
// content to include surrounding context and returns the widened bounds.
//
// The start moves backwards until it sits just after the second newline found
// before the selection, or reaches 0. The end moves forwards until it sits on
// the second newline found at or after selectionEndIndex, or reaches
// len(content).
func indices(content string, selectionStartIndex, selectionEndIndex int) (int, int) {
	begin := selectionStartIndex
	for newlinesSeen := 0; begin > 0; begin-- {
		if content[begin-1] != '\n' {
			continue
		}
		if newlinesSeen == 1 {
			break
		}
		newlinesSeen++
	}

	end := selectionEndIndex
	for newlinesSeen := 0; end < len(content); end++ {
		if content[end] != '\n' {
			continue
		}
		if newlinesSeen == 1 {
			break
		}
		newlinesSeen++
	}

	return begin, end
}
|
||||
|
||||
// writeStrings appends each of strs to buf in order. It stops at the first
// write error and returns it; otherwise it returns nil.
func writeStrings(buf *bytes.Buffer, strs ...string) error {
	for i := 0; i < len(strs); i++ {
		if _, err := buf.WriteString(strs[i]); err != nil {
			return err
		}
	}
	return nil
}
|
||||
|
||||
func HighlightSearchResultCode(filename, language string, lineNums []int, code string) []*ResultLine {
|
||||
// we should highlight the whole code block first, otherwise it doesn't work well with multiple line highlighting
|
||||
lexer := highlight.DetectChromaLexerByFileName(filename, language)
|
||||
hl := highlight.RenderCodeByLexer(lexer, code)
|
||||
highlightedLines := highlight.UnsafeSplitHighlightedLines(hl)
|
||||
|
||||
// The lineNums outputted by render might not match the original lineNums, because "highlight" removes the last `\n`
|
||||
lines := make([]*ResultLine, min(len(highlightedLines), len(lineNums)))
|
||||
for i := range lines {
|
||||
lines[i] = &ResultLine{
|
||||
Num: lineNums[i],
|
||||
FormattedContent: template.HTML(highlightedLines[i]),
|
||||
}
|
||||
}
|
||||
return lines
|
||||
}
|
||||
|
||||
func searchResult(result *internal.SearchResult, startIndex, endIndex int) (*Result, error) {
|
||||
startLineNum := 1 + strings.Count(result.Content[:startIndex], "\n")
|
||||
|
||||
var formattedLinesBuffer bytes.Buffer
|
||||
|
||||
contentLines := strings.SplitAfter(result.Content[startIndex:endIndex], "\n")
|
||||
lineNums := make([]int, 0, len(contentLines))
|
||||
index := startIndex
|
||||
for i, line := range contentLines {
|
||||
var err error
|
||||
if index < result.EndIndex &&
|
||||
result.StartIndex < index+len(line) &&
|
||||
result.StartIndex < result.EndIndex {
|
||||
openActiveIndex := max(result.StartIndex-index, 0)
|
||||
closeActiveIndex := min(result.EndIndex-index, len(line))
|
||||
err = writeStrings(&formattedLinesBuffer,
|
||||
line[:openActiveIndex],
|
||||
line[openActiveIndex:closeActiveIndex],
|
||||
line[closeActiveIndex:],
|
||||
)
|
||||
} else {
|
||||
err = writeStrings(&formattedLinesBuffer, line)
|
||||
}
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
lineNums = append(lineNums, startLineNum+i)
|
||||
index += len(line)
|
||||
}
|
||||
|
||||
return &Result{
|
||||
RepoID: result.RepoID,
|
||||
Filename: result.Filename,
|
||||
CommitID: result.CommitID,
|
||||
UpdatedUnix: result.UpdatedUnix,
|
||||
Language: result.Language,
|
||||
Color: result.Color,
|
||||
Lines: HighlightSearchResultCode(result.Filename, result.Language, lineNums, formattedLinesBuffer.String()),
|
||||
}, nil
|
||||
}
|
||||
|
||||
// PerformSearch perform a search on a repository
|
||||
func PerformSearch(ctx context.Context, opts *SearchOptions) (int64, []*Result, []*SearchResultLanguages, error) {
|
||||
if opts == nil || len(opts.Keyword) == 0 {
|
||||
return 0, nil, nil, nil
|
||||
}
|
||||
|
||||
total, results, resultLanguages, err := (*globalIndexer.Load()).Search(ctx, opts)
|
||||
if err != nil {
|
||||
return 0, nil, nil, err
|
||||
}
|
||||
|
||||
displayResults := make([]*Result, len(results))
|
||||
|
||||
for i, result := range results {
|
||||
startIndex, endIndex := indices(result.Content, result.StartIndex, result.EndIndex)
|
||||
displayResults[i], err = searchResult(result, startIndex, endIndex)
|
||||
if err != nil {
|
||||
return 0, nil, nil, err
|
||||
}
|
||||
}
|
||||
return total, displayResults, resultLanguages, nil
|
||||
}
|
||||
Reference in New Issue
Block a user