	Refactor indexer (#25174)
Refactor `modules/indexer` to make it more maintainable and easier to extend with new features. I'm trying to solve some issue-searching problems, and this is a precursor to making functional changes.
Currently supported engines and their index versions:
| engines | issues | code |
| - | - | - |
| db | Just a wrapper for database queries, doesn't need a version | - |
| bleve | The version of the index is **2** | The version of the index is **6** |
| elasticsearch | The old index has no version and will be treated as version **0** in this PR | The version of the index is **1** |
| meilisearch | The old index has no version and will be treated as version **0** in this PR | - |
## Changes
### Split
Split it into multiple packages:
```text
indexer
├── internal
│   ├── bleve
│   ├── db
│   ├── elasticsearch
│   └── meilisearch
├── code
│   ├── bleve
│   ├── elasticsearch
│   └── internal
└── issues
    ├── bleve
    ├── db
    ├── elasticsearch
    ├── internal
    └── meilisearch
```
- `indexer/internal`: Internal shared package for the indexer.
- `indexer/internal/[engine]`: Internal shared package for each engine (bleve/db/elasticsearch/meilisearch).
- `indexer/code`: Implementations for the code indexer.
- `indexer/code/internal`: Internal shared package for the code indexer.
- `indexer/code/[engine]`: Implementation via each engine for the code indexer.
- `indexer/issues`: Implementations for the issues indexer.
- `indexer/issues/internal`: Internal shared package for the issues indexer.
- `indexer/issues/[engine]`: Implementation via each engine for the issues indexer.
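
To make the layering concrete, the sketch below condenses the pattern used by the new `indexer/code/elasticsearch` package (the full implementation is in the diff further down): the engine-specific type keeps a pointer to the shared inner indexer for its own queries and embeds the shared interface for the common `Init/Ping/Close` behaviour.

```go
package elasticsearch

import (
	"code.gitea.io/gitea/modules/indexer/code/internal"
	indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
	inner_elasticsearch "code.gitea.io/gitea/modules/indexer/internal/elasticsearch"
)

// Compile-time check that the engine implementation satisfies the code indexer interface.
var _ internal.Indexer = &Indexer{}

// Indexer wires the engine-agnostic code indexer interface to the shared Elasticsearch helper.
type Indexer struct {
	inner                    *inner_elasticsearch.Indexer
	indexer_internal.Indexer // embedded for the shared behaviour; not the inner type directly, to avoid exposing too much
}

// NewIndexer creates a new Elasticsearch code indexer.
// esRepoIndexerLatestVersion and defaultMapping are defined in the full file below.
func NewIndexer(url, indexerName string) *Indexer {
	inner := inner_elasticsearch.NewIndexer(url, indexerName, esRepoIndexerLatestVersion, defaultMapping)
	return &Indexer{inner: inner, Indexer: inner}
}
```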
### Deduplication
- Combine `Init/Ping/Close` for the code indexer and the issues indexer.
- ~Combine `issues.indexerHolder` and `code.wrappedIndexer` into `internal.IndexHolder`.~ Removed it; use a dummy indexer instead when the real indexer is not ready (see the sketch after this list).
- Deduplicate the two copies of ES client creation.
- Deduplicate the two copies of `indexerID()`.
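
A minimal sketch of the dummy-indexer idea; the names and the interface shape here are illustrative, not the exact ones in this PR. While the real engine is still initializing, callers get a stub that fails loudly instead of a nil indexer.

```go
package main

import (
	"context"
	"errors"
	"fmt"
)

// Indexer is a trimmed-down stand-in for the shared indexer interface;
// the real interface in modules/indexer/internal has more methods.
type Indexer interface {
	Init(ctx context.Context) (bool, error)
	Ping(ctx context.Context) error
	Close()
}

var errNotReady = errors.New("indexer is not ready")

// dummyIndexer is used while the real indexer is still initializing,
// so callers get a clear error instead of dereferencing a nil indexer.
type dummyIndexer struct{}

func (d *dummyIndexer) Init(_ context.Context) (bool, error) { return false, errNotReady }
func (d *dummyIndexer) Ping(_ context.Context) error         { return errNotReady }
func (d *dummyIndexer) Close()                                {}

func main() {
	var idx Indexer = &dummyIndexer{} // swapped for the real engine once it is ready
	fmt.Println(idx.Ping(context.Background()))
}
```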
### Enhancement
- [x] Support an index version for the elasticsearch issues indexer; an old index without a version will be treated as version 0.
- [x] Fix the spelling of `elastic_search`/`ElasticSearch`; it should be `Elasticsearch`.
- [x] Improve versioning of the ES index; we don't need `Aliases` (see the naming sketch after this list):
  - Gitea doesn't need aliases for "Zero Downtime" because it never deletes old indexes.
  - The old code of the issues indexer uses the original name to create the issue index, so it's tricky to convert it to an alias.
- [x] Support an index version for the meilisearch issues indexer; an old index without a version will be treated as version 0.
- [x] Do "ping" only when `Ping` has been called; don't ping periodically and cache the status.
- [x] Support the context parameter whenever possible.
- [x] Fix the outdated example config.
- [x] Give up the requeue logic of the issues indexer (on an indexing failure, call `Ping` to check whether the engine is unavailable, and requeue the task only if it is):
  - It is fragile and tricky, could cause data loss (it did happen while I was testing for this PR), and it works for ES only.
  - Just always requeue the failed task; if the failure is caused by bad data, that's a bug in Gitea which should be fixed.
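
For illustration, versioning without aliases can be as simple as deriving the index name from a base name plus a version suffix; the shared helper's `VersionedIndexName()` used in the diff below follows this idea, though the exact format shown here is an assumption.

```go
package main

import "fmt"

// versionedIndexName derives a per-version index name such as "gitea_codes.v1".
// Illustrative only: the actual naming lives in modules/indexer/internal/elasticsearch.
func versionedIndexName(indexName string, version int) string {
	if version == 0 {
		// Version 0 marks a legacy index created before versioning existed,
		// so the original name is kept for compatibility.
		return indexName
	}
	return fmt.Sprintf("%s.v%d", indexName, version)
}

func main() {
	fmt.Println(versionedIndexName("gitea_codes", 0)) // gitea_codes
	fmt.Println(versionedIndexName("gitea_codes", 1)) // gitea_codes.v1
}
```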
---------
Co-authored-by: Giteabot <teabot@gitea.io>
			
			

modules/indexer/code/elasticsearch/elasticsearch.go (new file, +358 lines)

@@ -0,0 +1,358 @@
// Copyright 2020 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT

package elasticsearch

import (
	"bufio"
	"context"
	"fmt"
	"io"
	"strconv"
	"strings"

	repo_model "code.gitea.io/gitea/models/repo"
	"code.gitea.io/gitea/modules/analyze"
	"code.gitea.io/gitea/modules/charset"
	"code.gitea.io/gitea/modules/git"
	"code.gitea.io/gitea/modules/indexer/code/internal"
	indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
	inner_elasticsearch "code.gitea.io/gitea/modules/indexer/internal/elasticsearch"
	"code.gitea.io/gitea/modules/json"
	"code.gitea.io/gitea/modules/log"
	"code.gitea.io/gitea/modules/setting"
	"code.gitea.io/gitea/modules/timeutil"
	"code.gitea.io/gitea/modules/typesniffer"

	"github.com/go-enry/go-enry/v2"
	"github.com/olivere/elastic/v7"
)

const (
	esRepoIndexerLatestVersion = 1
	// multi-match-types, currently only 2 types are used
	// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types
	esMultiMatchTypeBestFields   = "best_fields"
	esMultiMatchTypePhrasePrefix = "phrase_prefix"
)

var _ internal.Indexer = &Indexer{}

// Indexer implements Indexer interface
type Indexer struct {
	inner                    *inner_elasticsearch.Indexer
	indexer_internal.Indexer // do not composite inner_elasticsearch.Indexer directly to avoid exposing too much
}

// NewIndexer creates a new elasticsearch indexer
func NewIndexer(url, indexerName string) *Indexer {
	inner := inner_elasticsearch.NewIndexer(url, indexerName, esRepoIndexerLatestVersion, defaultMapping)
	indexer := &Indexer{
		inner:   inner,
		Indexer: inner,
	}
	return indexer
}

const (
	defaultMapping = `{
		"mappings": {
			"properties": {
				"repo_id": {
					"type": "long",
					"index": true
				},
				"content": {
					"type": "text",
					"term_vector": "with_positions_offsets",
					"index": true
				},
				"commit_id": {
					"type": "keyword",
					"index": true
				},
				"language": {
					"type": "keyword",
					"index": true
				},
				"updated_at": {
					"type": "long",
					"index": true
				}
			}
		}
	}`
)

func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserError, batchReader *bufio.Reader, sha string, update internal.FileUpdate, repo *repo_model.Repository) ([]elastic.BulkableRequest, error) {
	// Ignore vendored files in code search
	if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) {
		return nil, nil
	}

	size := update.Size
	var err error
	if !update.Sized {
		var stdout string
		stdout, _, err = git.NewCommand(ctx, "cat-file", "-s").AddDynamicArguments(update.BlobSha).RunStdString(&git.RunOpts{Dir: repo.RepoPath()})
		if err != nil {
			return nil, err
		}
		if size, err = strconv.ParseInt(strings.TrimSpace(stdout), 10, 64); err != nil {
			return nil, fmt.Errorf("misformatted git cat-file output: %w", err)
		}
	}

	if size > setting.Indexer.MaxIndexerFileSize {
		return []elastic.BulkableRequest{b.addDelete(update.Filename, repo)}, nil
	}

	if _, err := batchWriter.Write([]byte(update.BlobSha + "\n")); err != nil {
		return nil, err
	}

	_, _, size, err = git.ReadBatchLine(batchReader)
	if err != nil {
		return nil, err
	}

	fileContents, err := io.ReadAll(io.LimitReader(batchReader, size))
	if err != nil {
		return nil, err
	} else if !typesniffer.DetectContentType(fileContents).IsText() {
		// FIXME: UTF-16 files will probably fail here
		return nil, nil
	}

	if _, err = batchReader.Discard(1); err != nil {
		return nil, err
	}
	id := internal.FilenameIndexerID(repo.ID, update.Filename)

	return []elastic.BulkableRequest{
		elastic.NewBulkIndexRequest().
			Index(b.inner.VersionedIndexName()).
			Id(id).
			Doc(map[string]interface{}{
				"repo_id":    repo.ID,
				"content":    string(charset.ToUTF8DropErrors(fileContents)),
				"commit_id":  sha,
				"language":   analyze.GetCodeLanguage(update.Filename, fileContents),
				"updated_at": timeutil.TimeStampNow(),
			}),
	}, nil
}

func (b *Indexer) addDelete(filename string, repo *repo_model.Repository) elastic.BulkableRequest {
	id := internal.FilenameIndexerID(repo.ID, filename)
	return elastic.NewBulkDeleteRequest().
		Index(b.inner.VersionedIndexName()).
		Id(id)
}

// Index will save the index data
func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *internal.RepoChanges) error {
	reqs := make([]elastic.BulkableRequest, 0)
	if len(changes.Updates) > 0 {
		// Now because of some insanity with git cat-file not immediately failing if not run in a valid git directory we need to run git rev-parse first!
		if err := git.EnsureValidGitRepository(ctx, repo.RepoPath()); err != nil {
			log.Error("Unable to open git repo: %s for %-v: %v", repo.RepoPath(), repo, err)
			return err
		}

		batchWriter, batchReader, cancel := git.CatFileBatch(ctx, repo.RepoPath())
		defer cancel()

		for _, update := range changes.Updates {
			updateReqs, err := b.addUpdate(ctx, batchWriter, batchReader, sha, update, repo)
			if err != nil {
				return err
			}
			if len(updateReqs) > 0 {
				reqs = append(reqs, updateReqs...)
			}
		}
		cancel()
	}

	for _, filename := range changes.RemovedFilenames {
		reqs = append(reqs, b.addDelete(filename, repo))
	}

	if len(reqs) > 0 {
		_, err := b.inner.Client.Bulk().
			Index(b.inner.VersionedIndexName()).
			Add(reqs...).
			Do(ctx)
		return err
	}
	return nil
}

// Delete deletes indexes by ids
func (b *Indexer) Delete(ctx context.Context, repoID int64) error {
	_, err := b.inner.Client.DeleteByQuery(b.inner.VersionedIndexName()).
		Query(elastic.NewTermsQuery("repo_id", repoID)).
		Do(ctx)
	return err
}

// indexPos find words positions for start and the following end on content. It will
// return the beginning position of the first start and the ending position of the
// first end following the start string.
// If not found any of the positions, it will return -1, -1.
func indexPos(content, start, end string) (int, int) {
	startIdx := strings.Index(content, start)
	if startIdx < 0 {
		return -1, -1
	}
	endIdx := strings.Index(content[startIdx+len(start):], end)
	if endIdx < 0 {
		return -1, -1
	}
	return startIdx, startIdx + len(start) + endIdx + len(end)
}

func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
	hits := make([]*internal.SearchResult, 0, pageSize)
	for _, hit := range searchResult.Hits.Hits {
		// FIXME: There is no way to get the position the keyword on the content currently on the same request.
		// So we get it from content, this may made the query slower. See
		// https://discuss.elastic.co/t/fetching-position-of-keyword-in-matched-document/94291
		var startIndex, endIndex int
		c, ok := hit.Highlight["content"]
		if ok && len(c) > 0 {
			// FIXME: Since the highlighting content will include <em> and </em> for the keywords,
			// now we should find the positions. But how to avoid html content which contains the
			// <em> and </em> tags? If elastic search has handled that?
			startIndex, endIndex = indexPos(c[0], "<em>", "</em>")
			if startIndex == -1 {
				panic(fmt.Sprintf("1===%s,,,%#v,,,%s", kw, hit.Highlight, c[0]))
			}
		} else {
			panic(fmt.Sprintf("2===%#v", hit.Highlight))
		}

		repoID, fileName := internal.ParseIndexerID(hit.Id)
		res := make(map[string]interface{})
		if err := json.Unmarshal(hit.Source, &res); err != nil {
			return 0, nil, nil, err
		}

		language := res["language"].(string)

		hits = append(hits, &internal.SearchResult{
			RepoID:      repoID,
			Filename:    fileName,
			CommitID:    res["commit_id"].(string),
			Content:     res["content"].(string),
			UpdatedUnix: timeutil.TimeStamp(res["updated_at"].(float64)),
			Language:    language,
			StartIndex:  startIndex,
			EndIndex:    endIndex - 9, // remove the length <em></em> since we give Content the original data
			Color:       enry.GetColor(language),
		})
	}

	return searchResult.TotalHits(), hits, extractAggs(searchResult), nil
}

func extractAggs(searchResult *elastic.SearchResult) []*internal.SearchResultLanguages {
	var searchResultLanguages []*internal.SearchResultLanguages
	agg, found := searchResult.Aggregations.Terms("language")
	if found {
		searchResultLanguages = make([]*internal.SearchResultLanguages, 0, 10)

		for _, bucket := range agg.Buckets {
			searchResultLanguages = append(searchResultLanguages, &internal.SearchResultLanguages{
				Language: bucket.Key.(string),
				Color:    enry.GetColor(bucket.Key.(string)),
				Count:    int(bucket.DocCount),
			})
		}
	}
	return searchResultLanguages
}

// Search searches for codes and language stats by given conditions.
func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
	searchType := esMultiMatchTypeBestFields
	if isMatch {
		searchType = esMultiMatchTypePhrasePrefix
	}

	kwQuery := elastic.NewMultiMatchQuery(keyword, "content").Type(searchType)
	query := elastic.NewBoolQuery()
	query = query.Must(kwQuery)
	if len(repoIDs) > 0 {
		repoStrs := make([]interface{}, 0, len(repoIDs))
		for _, repoID := range repoIDs {
			repoStrs = append(repoStrs, repoID)
		}
		repoQuery := elastic.NewTermsQuery("repo_id", repoStrs...)
		query = query.Must(repoQuery)
	}

	var (
		start       int
		kw          = "<em>" + keyword + "</em>"
		aggregation = elastic.NewTermsAggregation().Field("language").Size(10).OrderByCountDesc()
	)

	if page > 0 {
		start = (page - 1) * pageSize
	}

	if len(language) == 0 {
		searchResult, err := b.inner.Client.Search().
			Index(b.inner.VersionedIndexName()).
			Aggregation("language", aggregation).
			Query(query).
			Highlight(
				elastic.NewHighlight().
					Field("content").
					NumOfFragments(0). // return all highting content on fragments
					HighlighterType("fvh"),
			).
			Sort("repo_id", true).
			From(start).Size(pageSize).
			Do(ctx)
		if err != nil {
			return 0, nil, nil, err
		}

		return convertResult(searchResult, kw, pageSize)
	}

	langQuery := elastic.NewMatchQuery("language", language)
	countResult, err := b.inner.Client.Search().
		Index(b.inner.VersionedIndexName()).
		Aggregation("language", aggregation).
		Query(query).
		Size(0). // We only need stats information
		Do(ctx)
	if err != nil {
		return 0, nil, nil, err
	}

	query = query.Must(langQuery)
	searchResult, err := b.inner.Client.Search().
		Index(b.inner.VersionedIndexName()).
		Query(query).
		Highlight(
			elastic.NewHighlight().
				Field("content").
				NumOfFragments(0). // return all highting content on fragments
				HighlighterType("fvh"),
		).
		Sort("repo_id", true).
		From(start).Size(pageSize).
		Do(ctx)
	if err != nil {
		return 0, nil, nil, err
	}

	total, hits, _, err := convertResult(searchResult, kw, pageSize)

	return total, hits, extractAggs(countResult), err
}
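
A minimal usage sketch of the new package, assuming a reachable Elasticsearch instance; the URL and index name are placeholders, and in Gitea the indexer lifecycle is actually managed by `modules/indexer/code` rather than called directly like this.

```go
package main

import (
	"context"
	"fmt"
	"log"

	code_elasticsearch "code.gitea.io/gitea/modules/indexer/code/elasticsearch"
)

func main() {
	ctx := context.Background()

	// Placeholder connection string and index name; real values come from Gitea's indexer settings.
	idx := code_elasticsearch.NewIndexer("http://localhost:9200", "gitea_codes")

	// Search for "func main" in all repositories (nil repoIDs), any language,
	// first page of 10 results, using the default best_fields match.
	total, hits, langs, err := idx.Search(ctx, nil, "", "func main", 1, 10, false)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("total=%d hits=%d languages=%d\n", total, len(hits), len(langs))
}
```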
							
								
								
									
modules/indexer/code/elasticsearch/elasticsearch_test.go (new file, +16 lines)

@@ -0,0 +1,16 @@
// Copyright 2020 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT

package elasticsearch

import (
	"testing"

	"github.com/stretchr/testify/assert"
)

func TestIndexPos(t *testing.T) {
	startIdx, endIdx := indexPos("test index start and end", "start", "end")
	assert.EqualValues(t, 11, startIdx)
	assert.EqualValues(t, 24, endIdx)
}