mirror of
				https://github.com/go-gitea/gitea.git
				synced 2025-10-27 00:23:41 +09:00 
			
		
		
		
	Improve issue & code search (#33860)
Each "indexer" should declare for itself which "search modes" it supports. The "fuzzy" search for code also needs to be removed.
This commit is contained in:
		| @@ -17,6 +17,7 @@ import ( | ||||
| 	"code.gitea.io/gitea/modules/charset" | ||||
| 	"code.gitea.io/gitea/modules/git" | ||||
| 	"code.gitea.io/gitea/modules/gitrepo" | ||||
| 	"code.gitea.io/gitea/modules/indexer" | ||||
| 	path_filter "code.gitea.io/gitea/modules/indexer/code/bleve/token/path" | ||||
| 	"code.gitea.io/gitea/modules/indexer/code/internal" | ||||
| 	indexer_internal "code.gitea.io/gitea/modules/indexer/internal" | ||||
| @@ -136,6 +137,10 @@ type Indexer struct { | ||||
| 	indexer_internal.Indexer // do not composite inner_bleve.Indexer directly to avoid exposing too much | ||||
| } | ||||
|  | ||||
| func (b *Indexer) SupportedSearchModes() []indexer.SearchMode { | ||||
| 	return indexer.SearchModesExactWords() | ||||
| } | ||||
|  | ||||
| // NewIndexer creates a new bleve local indexer | ||||
| func NewIndexer(indexDir string) *Indexer { | ||||
| 	inner := inner_bleve.NewIndexer(indexDir, repoIndexerLatestVersion, generateBleveIndexMapping) | ||||
| @@ -267,19 +272,18 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int | ||||
| 	pathQuery.FieldVal = "Filename" | ||||
| 	pathQuery.SetBoost(10) | ||||
|  | ||||
| 	keywordAsPhrase, isPhrase := internal.ParseKeywordAsPhrase(opts.Keyword) | ||||
| 	if isPhrase { | ||||
| 		q := bleve.NewMatchPhraseQuery(keywordAsPhrase) | ||||
| 	if opts.SearchMode == indexer.SearchModeExact { | ||||
| 		q := bleve.NewMatchPhraseQuery(opts.Keyword) | ||||
| 		q.FieldVal = "Content" | ||||
| 		if opts.IsKeywordFuzzy { | ||||
| 			q.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(keywordAsPhrase) | ||||
| 		} | ||||
| 		contentQuery = q | ||||
| 	} else { | ||||
| 	} else /* words */ { | ||||
| 		q := bleve.NewMatchQuery(opts.Keyword) | ||||
| 		q.FieldVal = "Content" | ||||
| 		if opts.IsKeywordFuzzy { | ||||
| 		if opts.SearchMode == indexer.SearchModeFuzzy { | ||||
| 			// this logic doesn't seem right, it is only used to pass the test-case `Keyword:    "dESCRIPTION"`, which doesn't seem to be a real-life use-case. | ||||
| 			q.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword) | ||||
| 		} else { | ||||
| 			q.Operator = query.MatchQueryOperatorAnd | ||||
| 		} | ||||
| 		contentQuery = q | ||||
| 	} | ||||
|   | ||||
| @@ -16,6 +16,7 @@ import ( | ||||
| 	"code.gitea.io/gitea/modules/charset" | ||||
| 	"code.gitea.io/gitea/modules/git" | ||||
| 	"code.gitea.io/gitea/modules/gitrepo" | ||||
| 	"code.gitea.io/gitea/modules/indexer" | ||||
| 	"code.gitea.io/gitea/modules/indexer/code/internal" | ||||
| 	indexer_internal "code.gitea.io/gitea/modules/indexer/internal" | ||||
| 	inner_elasticsearch "code.gitea.io/gitea/modules/indexer/internal/elasticsearch" | ||||
| @@ -24,7 +25,6 @@ import ( | ||||
| 	"code.gitea.io/gitea/modules/setting" | ||||
| 	"code.gitea.io/gitea/modules/timeutil" | ||||
| 	"code.gitea.io/gitea/modules/typesniffer" | ||||
| 	"code.gitea.io/gitea/modules/util" | ||||
|  | ||||
| 	"github.com/go-enry/go-enry/v2" | ||||
| 	"github.com/olivere/elastic/v7" | ||||
| @@ -46,6 +46,10 @@ type Indexer struct { | ||||
| 	indexer_internal.Indexer // do not composite inner_elasticsearch.Indexer directly to avoid exposing too much | ||||
| } | ||||
|  | ||||
| func (b *Indexer) SupportedSearchModes() []indexer.SearchMode { | ||||
| 	return indexer.SearchModesExactWords() | ||||
| } | ||||
|  | ||||
| // NewIndexer creates a new elasticsearch indexer | ||||
| func NewIndexer(url, indexerName string) *Indexer { | ||||
| 	inner := inner_elasticsearch.NewIndexer(url, indexerName, esRepoIndexerLatestVersion, defaultMapping) | ||||
| @@ -361,15 +365,10 @@ func extractAggs(searchResult *elastic.SearchResult) []*internal.SearchResultLan | ||||
| // Search searches for codes and language stats by given conditions. | ||||
| func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) { | ||||
| 	var contentQuery elastic.Query | ||||
| 	keywordAsPhrase, isPhrase := internal.ParseKeywordAsPhrase(opts.Keyword) | ||||
| 	if isPhrase { | ||||
| 		contentQuery = elastic.NewMatchPhraseQuery("content", keywordAsPhrase) | ||||
| 	} else { | ||||
| 		// TODO: this is the old logic, but not really using "fuzziness" | ||||
| 		// * IsKeywordFuzzy=true: "best_fields" | ||||
| 		// * IsKeywordFuzzy=false: "phrase_prefix" | ||||
| 		contentQuery = elastic.NewMultiMatchQuery("content", opts.Keyword). | ||||
| 			Type(util.Iif(opts.IsKeywordFuzzy, esMultiMatchTypeBestFields, esMultiMatchTypePhrasePrefix)) | ||||
| 	if opts.SearchMode == indexer.SearchModeExact { | ||||
| 		contentQuery = elastic.NewMatchPhraseQuery("content", opts.Keyword) | ||||
| 	} else /* words */ { | ||||
| 		contentQuery = elastic.NewMultiMatchQuery("content", opts.Keyword).Type(esMultiMatchTypeBestFields).Operator("and") | ||||
| 	} | ||||
| 	kwQuery := elastic.NewBoolQuery().Should( | ||||
| 		contentQuery, | ||||
|   | ||||
| @@ -9,6 +9,7 @@ import ( | ||||
| 	"strings" | ||||
|  | ||||
| 	"code.gitea.io/gitea/modules/git" | ||||
| 	"code.gitea.io/gitea/modules/indexer" | ||||
| 	code_indexer "code.gitea.io/gitea/modules/indexer/code" | ||||
| 	"code.gitea.io/gitea/modules/setting" | ||||
| ) | ||||
| @@ -23,11 +24,16 @@ func indexSettingToGitGrepPathspecList() (list []string) { | ||||
| 	return list | ||||
| } | ||||
|  | ||||
| func PerformSearch(ctx context.Context, page int, repoID int64, gitRepo *git.Repository, ref git.RefName, keyword string, isFuzzy bool) (searchResults []*code_indexer.Result, total int, err error) { | ||||
| 	// TODO: it should also respect ParseKeywordAsPhrase and clarify the "fuzzy" behavior | ||||
| func PerformSearch(ctx context.Context, page int, repoID int64, gitRepo *git.Repository, ref git.RefName, keyword string, searchMode indexer.SearchModeType) (searchResults []*code_indexer.Result, total int, err error) { | ||||
| 	grepMode := git.GrepModeWords | ||||
| 	if searchMode == indexer.SearchModeExact { | ||||
| 		grepMode = git.GrepModeExact | ||||
| 	} else if searchMode == indexer.SearchModeRegexp { | ||||
| 		grepMode = git.GrepModeRegexp | ||||
| 	} | ||||
| 	res, err := git.GrepSearch(ctx, gitRepo, keyword, git.GrepOptions{ | ||||
| 		ContextLineNumber: 1, | ||||
| 		IsFuzzy:           isFuzzy, | ||||
| 		GrepMode:          grepMode, | ||||
| 		RefName:           ref.String(), | ||||
| 		PathspecList:      indexSettingToGitGrepPathspecList(), | ||||
| 	}) | ||||
|   | ||||
| @@ -14,6 +14,7 @@ import ( | ||||
| 	"code.gitea.io/gitea/models/db" | ||||
| 	repo_model "code.gitea.io/gitea/models/repo" | ||||
| 	"code.gitea.io/gitea/modules/graceful" | ||||
| 	"code.gitea.io/gitea/modules/indexer" | ||||
| 	"code.gitea.io/gitea/modules/indexer/code/bleve" | ||||
| 	"code.gitea.io/gitea/modules/indexer/code/elasticsearch" | ||||
| 	"code.gitea.io/gitea/modules/indexer/code/internal" | ||||
| @@ -302,3 +303,11 @@ func populateRepoIndexer(ctx context.Context) { | ||||
| 	} | ||||
| 	log.Info("Done (re)populating the repo indexer with existing repositories") | ||||
| } | ||||
|  | ||||
| func SupportedSearchModes() []indexer.SearchMode { | ||||
| 	gi := globalIndexer.Load() | ||||
| 	if gi == nil { | ||||
| 		return nil | ||||
| 	} | ||||
| 	return (*gi).SupportedSearchModes() | ||||
| } | ||||
|   | ||||
| @@ -11,6 +11,7 @@ import ( | ||||
|  | ||||
| 	"code.gitea.io/gitea/models/db" | ||||
| 	"code.gitea.io/gitea/models/unittest" | ||||
| 	indexer_module "code.gitea.io/gitea/modules/indexer" | ||||
| 	"code.gitea.io/gitea/modules/indexer/code/bleve" | ||||
| 	"code.gitea.io/gitea/modules/indexer/code/elasticsearch" | ||||
| 	"code.gitea.io/gitea/modules/indexer/code/internal" | ||||
| @@ -39,10 +40,11 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) { | ||||
| 		assert.NoError(t, setupRepositoryIndexes(t.Context(), indexer)) | ||||
|  | ||||
| 		keywords := []struct { | ||||
| 			RepoIDs []int64 | ||||
| 			Keyword string | ||||
| 			Langs   int | ||||
| 			Results []codeSearchResult | ||||
| 			RepoIDs    []int64 | ||||
| 			Keyword    string | ||||
| 			Langs      int | ||||
| 			SearchMode indexer_module.SearchModeType | ||||
| 			Results    []codeSearchResult | ||||
| 		}{ | ||||
| 			// Search for an exact match on the contents of a file | ||||
| 			// This scenario yields a single result (the file README.md on the repo '1') | ||||
| @@ -183,9 +185,10 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) { | ||||
| 			}, | ||||
| 			// Search for matches on the contents of files regardless of case. | ||||
| 			{ | ||||
| 				RepoIDs: nil, | ||||
| 				Keyword: "dESCRIPTION", | ||||
| 				Langs:   1, | ||||
| 				RepoIDs:    nil, | ||||
| 				Keyword:    "dESCRIPTION", | ||||
| 				Langs:      1, | ||||
| 				SearchMode: indexer_module.SearchModeFuzzy, | ||||
| 				Results: []codeSearchResult{ | ||||
| 					{ | ||||
| 						Filename: "README.md", | ||||
| @@ -193,7 +196,7 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) { | ||||
| 					}, | ||||
| 				}, | ||||
| 			}, | ||||
| 			// Search for an exact match on the filename within the repo '62' (case insenstive). | ||||
| 			// Search for an exact match on the filename within the repo '62' (case-insensitive). | ||||
| 			// This scenario yields a single result (the file avocado.md on the repo '62') | ||||
| 			{ | ||||
| 				RepoIDs: []int64{62}, | ||||
| @@ -206,7 +209,7 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) { | ||||
| 					}, | ||||
| 				}, | ||||
| 			}, | ||||
| 			// Search for matches on the contents of files when the criteria is a expression. | ||||
| 			// Search for matches on the contents of files when the criteria are an expression. | ||||
| 			{ | ||||
| 				RepoIDs: []int64{62}, | ||||
| 				Keyword: "console.log", | ||||
| @@ -218,7 +221,7 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) { | ||||
| 					}, | ||||
| 				}, | ||||
| 			}, | ||||
| 			// Search for matches on the contents of files when the criteria is part of a expression. | ||||
| 			// Search for matches on the contents of files when the criteria are parts of an expression. | ||||
| 			{ | ||||
| 				RepoIDs: []int64{62}, | ||||
| 				Keyword: "log", | ||||
| @@ -235,16 +238,16 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) { | ||||
| 		for _, kw := range keywords { | ||||
| 			t.Run(kw.Keyword, func(t *testing.T) { | ||||
| 				total, res, langs, err := indexer.Search(t.Context(), &internal.SearchOptions{ | ||||
| 					RepoIDs: kw.RepoIDs, | ||||
| 					Keyword: kw.Keyword, | ||||
| 					RepoIDs:    kw.RepoIDs, | ||||
| 					Keyword:    kw.Keyword, | ||||
| 					SearchMode: kw.SearchMode, | ||||
| 					Paginator: &db.ListOptions{ | ||||
| 						Page:     1, | ||||
| 						PageSize: 10, | ||||
| 					}, | ||||
| 					IsKeywordFuzzy: true, | ||||
| 				}) | ||||
| 				assert.NoError(t, err) | ||||
| 				assert.Len(t, langs, kw.Langs) | ||||
| 				require.NoError(t, err) | ||||
| 				require.Len(t, langs, kw.Langs) | ||||
|  | ||||
| 				hits := make([]codeSearchResult, 0, len(res)) | ||||
|  | ||||
| @@ -289,7 +292,7 @@ func TestBleveIndexAndSearch(t *testing.T) { | ||||
| 	_, err := idx.Init(t.Context()) | ||||
| 	require.NoError(t, err) | ||||
|  | ||||
| 	testIndexer("beleve", t, idx) | ||||
| 	testIndexer("bleve", t, idx) | ||||
| } | ||||
|  | ||||
| func TestESIndexAndSearch(t *testing.T) { | ||||
|   | ||||
| @@ -9,6 +9,7 @@ import ( | ||||
|  | ||||
| 	"code.gitea.io/gitea/models/db" | ||||
| 	repo_model "code.gitea.io/gitea/models/repo" | ||||
| 	"code.gitea.io/gitea/modules/indexer" | ||||
| 	"code.gitea.io/gitea/modules/indexer/internal" | ||||
| ) | ||||
|  | ||||
| @@ -18,6 +19,7 @@ type Indexer interface { | ||||
| 	Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *RepoChanges) error | ||||
| 	Delete(ctx context.Context, repoID int64) error | ||||
| 	Search(ctx context.Context, opts *SearchOptions) (int64, []*SearchResult, []*SearchResultLanguages, error) | ||||
| 	SupportedSearchModes() []indexer.SearchMode | ||||
| } | ||||
|  | ||||
| type SearchOptions struct { | ||||
| @@ -25,7 +27,7 @@ type SearchOptions struct { | ||||
| 	Keyword  string | ||||
| 	Language string | ||||
|  | ||||
| 	IsKeywordFuzzy bool | ||||
| 	SearchMode indexer.SearchModeType | ||||
|  | ||||
| 	db.Paginator | ||||
| } | ||||
| @@ -41,6 +43,10 @@ type dummyIndexer struct { | ||||
| 	internal.Indexer | ||||
| } | ||||
|  | ||||
| func (d *dummyIndexer) SupportedSearchModes() []indexer.SearchMode { | ||||
| 	return nil | ||||
| } | ||||
|  | ||||
| func (d *dummyIndexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *RepoChanges) error { | ||||
| 	return fmt.Errorf("indexer is not ready") | ||||
| } | ||||
|   | ||||
| @@ -10,9 +10,7 @@ import ( | ||||
| 	"code.gitea.io/gitea/modules/log" | ||||
| ) | ||||
|  | ||||
| const ( | ||||
| 	filenameMatchNumberOfLines = 7 // Copied from github search | ||||
| ) | ||||
| const filenameMatchNumberOfLines = 7 // Copied from GitHub search | ||||
|  | ||||
| func FilenameIndexerID(repoID int64, filename string) string { | ||||
| 	return internal.Base36(repoID) + "_" + filename | ||||
| @@ -48,11 +46,3 @@ func FilenameMatchIndexPos(content string) (int, int) { | ||||
| 	} | ||||
| 	return 0, len(content) | ||||
| } | ||||
|  | ||||
| func ParseKeywordAsPhrase(keyword string) (string, bool) { | ||||
| 	if strings.HasPrefix(keyword, `"`) && strings.HasSuffix(keyword, `"`) && len(keyword) > 1 { | ||||
| 		// only remove the prefix and suffix quotes, no need to decode the content at the moment | ||||
| 		return keyword[1 : len(keyword)-1], true | ||||
| 	} | ||||
| 	return "", false | ||||
| } | ||||
|   | ||||
| @@ -1,30 +0,0 @@ | ||||
| // Copyright 2025 The Gitea Authors. All rights reserved. | ||||
| // SPDX-License-Identifier: MIT | ||||
|  | ||||
| package internal | ||||
|  | ||||
| import ( | ||||
| 	"testing" | ||||
|  | ||||
| 	"github.com/stretchr/testify/assert" | ||||
| ) | ||||
|  | ||||
| func TestParseKeywordAsPhrase(t *testing.T) { | ||||
| 	cases := []struct { | ||||
| 		keyword  string | ||||
| 		phrase   string | ||||
| 		isPhrase bool | ||||
| 	}{ | ||||
| 		{``, "", false}, | ||||
| 		{`a`, "", false}, | ||||
| 		{`"`, "", false}, | ||||
| 		{`"a`, "", false}, | ||||
| 		{`"a"`, "a", true}, | ||||
| 		{`""\"""`, `"\""`, true}, | ||||
| 	} | ||||
| 	for _, c := range cases { | ||||
| 		phrase, isPhrase := ParseKeywordAsPhrase(c.keyword) | ||||
| 		assert.Equal(t, c.phrase, phrase, "keyword=%q", c.keyword) | ||||
| 		assert.Equal(t, c.isPhrase, isPhrase, "keyword=%q", c.keyword) | ||||
| 	} | ||||
| } | ||||
| @@ -129,7 +129,6 @@ func searchResult(result *internal.SearchResult, startIndex, endIndex int) (*Res | ||||
| } | ||||
|  | ||||
| // PerformSearch perform a search on a repository | ||||
| // if isFuzzy is true set the Damerau-Levenshtein distance from 0 to 2 | ||||
| func PerformSearch(ctx context.Context, opts *SearchOptions) (int, []*Result, []*SearchResultLanguages, error) { | ||||
| 	if opts == nil || len(opts.Keyword) == 0 { | ||||
| 		return 0, nil, nil, nil | ||||
|   | ||||
		Reference in New Issue
	
	Block a user