Replace olivere/elastic with REST API client, add OpenSearch support (#37411)

Drops `github.com/olivere/elastic/v7` (unmaintained) and replaces it
with a small in-house wrapper that speaks the Elasticsearch REST API
directly via `net/http`. The subset used by Gitea (`_cluster/health`,
`_bulk`, `_doc`, `_delete_by_query`, `_refresh`, `_search`, `HEAD`/`PUT`
index) is stable across the targeted servers, so no client library is
needed.

**Targets tested**
- Elasticsearch 7, 8, 9
- OpenSearch 1, 2, 3

**Why not `go-elasticsearch`?**
The official client enforces an `X-Elastic-Product` server-identity
check that OpenSearch deliberately fails, which would force shipping a
transport shim to defeat it. Going direct over `net/http` removes that
fight along with several MB of transitive deps (`elastic-transport-go`,
`go.opentelemetry.io/otel{,/metric,/trace}`, `auto/sdk`, `easyjson`,
`intern`, `logr`, `stdr`).

Replaces: #30755
Fixes: https://github.com/go-gitea/gitea/issues/30752

---
This PR was written with the help of Claude Opus 4.7

---------

Co-authored-by: Claude (Opus 4.7) <noreply@anthropic.com>
Co-authored-by: Lunny Xiao <xiaolunwen@gmail.com>
This commit is contained in:
silverwind
2026-05-02 00:12:54 +02:00
committed by GitHub
parent 31cee60cc7
commit abcfa53040
15 changed files with 825 additions and 361 deletions

View File

@@ -18,8 +18,7 @@ import (
"code.gitea.io/gitea/modules/gitrepo"
"code.gitea.io/gitea/modules/indexer"
"code.gitea.io/gitea/modules/indexer/code/internal"
indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
inner_elasticsearch "code.gitea.io/gitea/modules/indexer/internal/elasticsearch"
es "code.gitea.io/gitea/modules/indexer/internal/elasticsearch"
"code.gitea.io/gitea/modules/json"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
@@ -28,23 +27,15 @@ import (
"code.gitea.io/gitea/modules/util"
"github.com/go-enry/go-enry/v2"
"github.com/olivere/elastic/v7"
)
const (
esRepoIndexerLatestVersion = 3
// multi-match-types, currently only 2 types are used
// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types
esMultiMatchTypeBestFields = "best_fields"
esMultiMatchTypePhrasePrefix = "phrase_prefix"
)
const esRepoIndexerLatestVersion = 3
var _ internal.Indexer = &Indexer{}
// Indexer implements Indexer interface
type Indexer struct {
inner *inner_elasticsearch.Indexer
indexer_internal.Indexer // do not composite inner_elasticsearch.Indexer directly to avoid exposing too much
*es.Indexer
}
func (b *Indexer) SupportedSearchModes() []indexer.SearchMode {
@@ -53,12 +44,7 @@ func (b *Indexer) SupportedSearchModes() []indexer.SearchMode {
// NewIndexer creates a new elasticsearch indexer
func NewIndexer(url, indexerName string) *Indexer {
inner := inner_elasticsearch.NewIndexer(url, indexerName, esRepoIndexerLatestVersion, defaultMapping)
indexer := &Indexer{
inner: inner,
Indexer: inner,
}
return indexer
return &Indexer{Indexer: es.NewIndexer(url, indexerName, esRepoIndexerLatestVersion, defaultMapping)}
}
const (
@@ -138,7 +124,7 @@ const (
}`
)
func (b *Indexer) addUpdate(ctx context.Context, catFileBatch git.CatFileBatch, sha string, update internal.FileUpdate, repo *repo_model.Repository) ([]elastic.BulkableRequest, error) {
func (b *Indexer) addUpdate(ctx context.Context, catFileBatch git.CatFileBatch, sha string, update internal.FileUpdate, repo *repo_model.Repository) ([]es.BulkOp, error) {
// Ignore vendored files in code search
if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) {
return nil, nil
@@ -157,8 +143,9 @@ func (b *Indexer) addUpdate(ctx context.Context, catFileBatch git.CatFileBatch,
}
}
id := internal.FilenameIndexerID(repo.ID, update.Filename)
if size > setting.Indexer.MaxIndexerFileSize {
return []elastic.BulkableRequest{b.addDelete(update.Filename, repo)}, nil
return []es.BulkOp{es.DeleteOp(id)}, nil
}
info, batchReader, err := catFileBatch.QueryContent(update.BlobSha)
@@ -177,33 +164,24 @@ func (b *Indexer) addUpdate(ctx context.Context, catFileBatch git.CatFileBatch,
if _, err = batchReader.Discard(1); err != nil {
return nil, err
}
id := internal.FilenameIndexerID(repo.ID, update.Filename)
return []elastic.BulkableRequest{
elastic.NewBulkIndexRequest().
Index(b.inner.VersionedIndexName()).
Id(id).
Doc(map[string]any{
"repo_id": repo.ID,
"filename": update.Filename,
"content": string(charset.ToUTF8DropErrors(fileContents)),
"commit_id": sha,
"language": analyze.GetCodeLanguage(update.Filename, fileContents),
"updated_at": timeutil.TimeStampNow(),
}),
}, nil
return []es.BulkOp{es.IndexOp(id, map[string]any{
"repo_id": repo.ID,
"filename": update.Filename,
"content": string(charset.ToUTF8DropErrors(fileContents)),
"commit_id": sha,
"language": analyze.GetCodeLanguage(update.Filename, fileContents),
"updated_at": timeutil.TimeStampNow(),
})}, nil
}
func (b *Indexer) addDelete(filename string, repo *repo_model.Repository) elastic.BulkableRequest {
id := internal.FilenameIndexerID(repo.ID, filename)
return elastic.NewBulkDeleteRequest().
Index(b.inner.VersionedIndexName()).
Id(id)
// addDelete builds the bulk delete op removing filename of repo from the index.
func (b *Indexer) addDelete(filename string, repo *repo_model.Repository) es.BulkOp {
	return es.DeleteOp(internal.FilenameIndexerID(repo.ID, filename))
}
// Index will save the index data
func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *internal.RepoChanges) error {
reqs := make([]elastic.BulkableRequest, 0)
ops := make([]es.BulkOp, 0)
if len(changes.Updates) > 0 {
batch, err := gitrepo.NewBatch(ctx, repo)
if err != nil {
@@ -212,29 +190,25 @@ func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, sha st
defer batch.Close()
for _, update := range changes.Updates {
updateReqs, err := b.addUpdate(ctx, batch, sha, update, repo)
updateOps, err := b.addUpdate(ctx, batch, sha, update, repo)
if err != nil {
return err
}
if len(updateReqs) > 0 {
reqs = append(reqs, updateReqs...)
if len(updateOps) > 0 {
ops = append(ops, updateOps...)
}
}
}
for _, filename := range changes.RemovedFilenames {
reqs = append(reqs, b.addDelete(filename, repo))
ops = append(ops, b.addDelete(filename, repo))
}
if len(reqs) > 0 {
if len(ops) > 0 {
esBatchSize := 50
for i := 0; i < len(reqs); i += esBatchSize {
_, err := b.inner.Client.Bulk().
Index(b.inner.VersionedIndexName()).
Add(reqs[i:min(i+esBatchSize, len(reqs))]...).
Do(ctx)
if err != nil {
for i := 0; i < len(ops); i += esBatchSize {
if err := b.Bulk(ctx, ops[i:min(i+esBatchSize, len(ops))]); err != nil {
return err
}
}
@@ -246,33 +220,21 @@ func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, sha st
func (b *Indexer) Delete(ctx context.Context, repoID int64) error {
if err := b.doDelete(ctx, repoID); err != nil {
// Maybe there is a conflict during the delete operation, so we should retry after a refresh
log.Warn("Deletion of entries of repo %v within index %v was erroneous. Trying to refresh index before trying again", repoID, b.inner.VersionedIndexName(), err)
if err := b.refreshIndex(ctx); err != nil {
log.Warn("Deletion of entries of repo %v within index %v was erroneous: %v. Trying to refresh index before trying again", repoID, b.VersionedIndexName(), err)
if err := b.Refresh(ctx); err != nil {
return err
}
if err := b.doDelete(ctx, repoID); err != nil {
log.Error("Could not delete entries of repo %v within index %v", repoID, b.inner.VersionedIndexName())
log.Error("Could not delete entries of repo %v within index %v", repoID, b.VersionedIndexName())
return err
}
}
return nil
}
func (b *Indexer) refreshIndex(ctx context.Context) error {
if _, err := b.inner.Client.Refresh(b.inner.VersionedIndexName()).Do(ctx); err != nil {
log.Error("Error while trying to refresh index %v", b.inner.VersionedIndexName(), err)
return err
}
return nil
}
// Delete entries by repoId
func (b *Indexer) doDelete(ctx context.Context, repoID int64) error {
_, err := b.inner.Client.DeleteByQuery(b.inner.VersionedIndexName()).
Query(elastic.NewTermsQuery("repo_id", repoID)).
Do(ctx)
return err
return b.DeleteByQuery(ctx, es.TermsQuery("repo_id", repoID))
}
// contentMatchIndexPos find words positions for start and the following end on content. It will
@@ -291,10 +253,10 @@ func contentMatchIndexPos(content, start, end string) (int, int) {
return startIdx, (startIdx + len(start) + endIdx + len(end)) - 9 // remove the length <em></em> since we give Content the original data
}
func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
func convertResult(searchResult *es.SearchResponse, kw string, pageSize int) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
hits := make([]*internal.SearchResult, 0, pageSize)
for _, hit := range searchResult.Hits.Hits {
repoID, fileName := internal.ParseIndexerID(hit.Id)
for _, hit := range searchResult.Hits {
repoID, fileName := internal.ParseIndexerID(hit.ID)
res := make(map[string]any)
if err := json.Unmarshal(hit.Source, &res); err != nil {
return 0, nil, nil, err
@@ -333,111 +295,111 @@ func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int)
})
}
return searchResult.TotalHits(), hits, extractAggs(searchResult), nil
return searchResult.Total, hits, extractAggs(searchResult), nil
}
func extractAggs(searchResult *elastic.SearchResult) []*internal.SearchResultLanguages {
var searchResultLanguages []*internal.SearchResultLanguages
agg, found := searchResult.Aggregations.Terms("language")
if found {
searchResultLanguages = make([]*internal.SearchResultLanguages, 0, 10)
for _, bucket := range agg.Buckets {
searchResultLanguages = append(searchResultLanguages, &internal.SearchResultLanguages{
Language: bucket.Key.(string),
Color: enry.GetColor(bucket.Key.(string)),
Count: int(bucket.DocCount),
})
// extractAggs turns the "language" terms aggregation of a search response
// into Gitea's per-language result counts. Returns nil when the response
// carries no such aggregation.
func extractAggs(searchResult *es.SearchResponse) []*internal.SearchResultLanguages {
	buckets, found := searchResult.Aggregations["language"]
	if !found {
		return nil
	}
	langs := make([]*internal.SearchResultLanguages, 0, 10)
	for _, bucket := range buckets {
		// language is mapped as keyword so the key is always a string; if the
		// mapping ever changes, skip rather than emit an empty-language bucket.
		name, ok := bucket.Key.(string)
		if !ok {
			continue
		}
		langs = append(langs, &internal.SearchResultLanguages{
			Language: name,
			Color:    enry.GetColor(name),
			Count:    int(bucket.DocCount),
		})
	}
	return langs
}
// Search searches for codes and language stats by given conditions.
func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
var contentQuery elastic.Query
searchMode := util.IfZero(opts.SearchMode, b.SupportedSearchModes()[0].ModeValue)
contentQuery := es.Query(es.NewMultiMatchQuery(opts.Keyword, "content").Type(es.MultiMatchTypeBestFields).Operator("and"))
if searchMode == indexer.SearchModeExact {
// 1.21 used NewMultiMatchQuery().Type(esMultiMatchTypePhrasePrefix), but later releases changed to NewMatchPhraseQuery
contentQuery = elastic.NewMatchPhraseQuery("content", opts.Keyword)
} else /* words */ {
contentQuery = elastic.NewMultiMatchQuery("content", opts.Keyword).Type(esMultiMatchTypeBestFields).Operator("and")
contentQuery = es.MatchPhraseQuery("content", opts.Keyword)
}
kwQuery := elastic.NewBoolQuery().Should(
kwQuery := es.NewBoolQuery().Should(
contentQuery,
elastic.NewMultiMatchQuery(opts.Keyword, "filename^10").Type(esMultiMatchTypePhrasePrefix),
es.NewMultiMatchQuery(opts.Keyword, "filename^10").Type(es.MultiMatchTypePhrasePrefix),
)
query := elastic.NewBoolQuery()
query = query.Must(kwQuery)
query := es.NewBoolQuery().Must(kwQuery)
if len(opts.RepoIDs) > 0 {
repoStrs := make([]any, 0, len(opts.RepoIDs))
for _, repoID := range opts.RepoIDs {
repoStrs = append(repoStrs, repoID)
}
repoQuery := elastic.NewTermsQuery("repo_id", repoStrs...)
query = query.Must(repoQuery)
query.Must(es.TermsQuery("repo_id", es.ToAnySlice(opts.RepoIDs)...))
}
var (
start, pageSize = opts.GetSkipTake()
kw = "<em>" + opts.Keyword + "</em>"
aggregation = elastic.NewTermsAggregation().Field("language").Size(10).OrderByCountDesc()
)
start, pageSize := opts.GetSkipTake()
kw := "<em>" + opts.Keyword + "</em>"
languageAggs := map[string]any{
"language": map[string]any{
"terms": map[string]any{
"field": "language",
"size": 10,
"order": map[string]any{"_count": "desc"},
},
},
}
// number_of_fragments=0 returns the full highlighted content (no fragmentation).
highlight := map[string]any{
"fields": map[string]any{
"content": map[string]any{},
"filename": map[string]any{},
},
"number_of_fragments": 0,
"type": "fvh",
}
sort := []es.SortField{
{Field: "_score", Desc: true},
{Field: "updated_at", Desc: false},
}
if len(opts.Language) == 0 {
searchResult, err := b.inner.Client.Search().
Index(b.inner.VersionedIndexName()).
Aggregation("language", aggregation).
Query(query).
Highlight(
elastic.NewHighlight().
Field("content").
Field("filename").
NumOfFragments(0). // return all highlighting content on fragments
HighlighterType("fvh"),
).
Sort("_score", false).
Sort("updated_at", true).
From(start).Size(pageSize).
Do(ctx)
resp, err := b.Indexer.Search(ctx, es.SearchRequest{
Query: query,
Sort: sort,
From: start,
Size: pageSize,
TrackTotal: true,
Aggregations: languageAggs,
Highlight: highlight,
})
if err != nil {
return 0, nil, nil, err
}
return convertResult(searchResult, kw, pageSize)
return convertResult(resp, kw, pageSize)
}
langQuery := elastic.NewMatchQuery("language", opts.Language)
countResult, err := b.inner.Client.Search().
Index(b.inner.VersionedIndexName()).
Aggregation("language", aggregation).
Query(query).
Size(0). // We only need stats information
Do(ctx)
countResp, err := b.Indexer.Search(ctx, es.SearchRequest{
Query: query,
Size: 0, // stats only
TrackTotal: true,
Aggregations: languageAggs,
})
if err != nil {
return 0, nil, nil, err
}
query = query.Must(langQuery)
searchResult, err := b.inner.Client.Search().
Index(b.inner.VersionedIndexName()).
Query(query).
Highlight(
elastic.NewHighlight().
Field("content").
Field("filename").
NumOfFragments(0). // return all highlighting content on fragments
HighlighterType("fvh"),
).
Sort("_score", false).
Sort("updated_at", true).
From(start).Size(pageSize).
Do(ctx)
query.Must(es.MatchQuery("language", opts.Language))
resp, err := b.Indexer.Search(ctx, es.SearchRequest{
Query: query,
Sort: sort,
From: start,
Size: pageSize,
TrackTotal: true,
Highlight: highlight,
})
if err != nil {
return 0, nil, nil, err
}
total, hits, _, err := convertResult(searchResult, kw, pageSize)
return total, hits, extractAggs(countResult), err
total, hits, _, err := convertResult(resp, kw, pageSize)
return total, hits, extractAggs(countResp), err
}

View File

@@ -8,6 +8,7 @@ import (
"os"
"slices"
"testing"
"time"
"code.gitea.io/gitea/models/db"
"code.gitea.io/gitea/models/unittest"
@@ -39,6 +40,16 @@ func TestMain(m *testing.M) {
func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
t.Run(name, func(t *testing.T) {
assert.NoError(t, setupRepositoryIndexes(t.Context(), indexer))
// Wait for the index to catch up: ES/OpenSearch make writes visible
// only after a refresh (default interval: 1s). Bleve is synchronous
// and passes on the first iteration.
require.Eventually(t, func() bool {
total, _, _, err := indexer.Search(t.Context(), &internal.SearchOptions{
Keyword: "Description",
Paginator: &db.ListOptions{Page: 1, PageSize: 1},
})
return err == nil && total > 0
}, 10*time.Second, 100*time.Millisecond, "index did not become searchable")
keywords := []struct {
RepoIDs []int64

View File

@@ -4,52 +4,80 @@
package elasticsearch
import (
"bytes"
"context"
"errors"
"fmt"
"io"
"net"
"net/http"
"net/url"
"slices"
"strconv"
"strings"
"time"
"code.gitea.io/gitea/modules/indexer/internal"
"github.com/olivere/elastic/v7"
"code.gitea.io/gitea/modules/json"
)
var _ internal.Indexer = &Indexer{}
// Indexer represents a basic elasticsearch indexer implementation
// Indexer is a narrow wrapper around an Elasticsearch/OpenSearch cluster.
// It targets the REST subset shared by Elasticsearch 7/8/9 and OpenSearch 3.
type Indexer struct {
Client *elastic.Client
client *http.Client
base string // base URL with trailing slash, no userinfo
user string
pass string
url string
indexName string
version int
mapping string
}
func NewIndexer(url, indexName string, version int, mapping string) *Indexer {
// NewIndexer builds an Indexer. The connection is opened by Init.
func NewIndexer(rawURL, indexName string, version int, mapping string) *Indexer {
return &Indexer{
url: url,
base: rawURL,
indexName: indexName,
version: version,
mapping: mapping,
}
}
// Init initializes the indexer
// Init connects and creates the versioned index if missing, returning true if it already existed.
func (i *Indexer) Init(ctx context.Context) (bool, error) {
if i == nil {
return false, errors.New("cannot init nil indexer")
}
if i.Client != nil {
return false, errors.New("indexer is already initialized")
}
client, err := i.initClient()
parsed, err := url.Parse(i.base)
if err != nil {
return false, err
return false, fmt.Errorf("parse elasticsearch url: %w", err)
}
if parsed.User != nil {
i.user = parsed.User.Username()
i.pass, _ = parsed.User.Password()
parsed.User = nil
}
base := parsed.String()
if !strings.HasSuffix(base, "/") {
base += "/"
}
i.base = base
// No client-level Timeout: bulk/_delete_by_query can legitimately run for
// minutes on large repos. Per-request deadlines come from the caller's ctx;
// transport-level timeouts cover stalled connects/handshakes/headers so a
// half-open server cannot wedge the indexer indefinitely.
i.client = &http.Client{
Transport: &http.Transport{
Proxy: http.ProxyFromEnvironment,
DialContext: (&net.Dialer{Timeout: 30 * time.Second, KeepAlive: 30 * time.Second}).DialContext,
TLSHandshakeTimeout: 10 * time.Second,
ResponseHeaderTimeout: 30 * time.Second,
ExpectContinueTimeout: 1 * time.Second,
IdleConnTimeout: 90 * time.Second,
MaxIdleConns: 100,
},
}
i.Client = client
exists, err := i.Client.IndexExists(i.VersionedIndexName()).Do(ctx)
exists, err := i.indexExists(ctx, i.VersionedIndexName())
if err != nil {
return false, err
}
@@ -61,34 +89,321 @@ func (i *Indexer) Init(ctx context.Context) (bool, error) {
return false, err
}
return exists, nil
return false, nil
}
// Ping checks if the indexer is available
// Ping returns an error when the cluster is unusable (status != green/yellow).
func (i *Indexer) Ping(ctx context.Context) error {
if i == nil {
return errors.New("cannot ping nil indexer")
var body struct {
Status string `json:"status"`
}
if i.Client == nil {
return errors.New("indexer is not initialized")
}
resp, err := i.Client.ClusterHealth().Do(ctx)
if err != nil {
if err := i.doJSON(ctx, http.MethodGet, "_cluster/health", nil, &body); err != nil {
return err
}
if resp.Status != "green" && resp.Status != "yellow" {
// It's healthy if the status is green, and it's available if the status is yellow,
// see https://www.elastic.co/guide/en/elasticsearch/reference/current/cluster-health.html
return fmt.Errorf("status of elasticsearch cluster is %s", resp.Status)
// Healthy = green; usable = yellow. Red is unusable.
// https://www.elastic.co/guide/en/elasticsearch/reference/current/cluster-health.html
if body.Status != "green" && body.Status != "yellow" {
return fmt.Errorf("status of elasticsearch cluster is %s", body.Status)
}
return nil
}
// Close closes the indexer
// Close releases idle HTTP connections held by the client.
func (i *Indexer) Close() {
if i == nil {
if i == nil || i.client == nil {
return
}
i.Client = nil
i.client.CloseIdleConnections()
i.client = nil
}
// Bulk submits index/delete ops. Returns the first item-level failure, if any.
func (i *Indexer) Bulk(ctx context.Context, ops []BulkOp) error {
	if len(ops) == 0 {
		return nil
	}
	index := i.VersionedIndexName()
	// Build the NDJSON payload: one action/metadata line per op, followed by
	// the document line for index ops (delete ops carry no document line).
	var buf bytes.Buffer
	buf.Grow(len(ops) * 256) // rough per-op estimate to avoid repeated growth
	for _, op := range ops {
		meta := map[string]any{op.action: map[string]any{"_index": index, "_id": op.id}}
		if err := writeJSONLine(&buf, meta); err != nil {
			return err
		}
		if op.action == bulkActionIndex {
			if err := writeJSONLine(&buf, op.doc); err != nil {
				return err
			}
		}
	}
	res, err := i.do(ctx, http.MethodPost, urlPath(index, "_bulk"), "application/x-ndjson", bytes.NewReader(buf.Bytes()))
	if err != nil {
		return err
	}
	defer drainAndClose(res)
	// _bulk replies 200 even when individual items failed; per-item status
	// lives only in the response body, so always decode it.
	var body struct {
		Errors bool `json:"errors"`
		Items  []map[string]struct {
			Status int        `json:"status"`
			Error  json.Value `json:"error"`
		} `json:"items"`
	}
	if err := json.NewDecoder(res.Body).Decode(&body); err != nil {
		return err
	}
	if !body.Errors {
		return nil
	}
	return firstBulkError(body.Items)
}
// firstBulkError returns the first item-level failure in a bulk response.
// Each items entry is a single-key map ({"index": {...}} or {"delete": {...}}).
// Delete-of-missing (404) is idempotent and not reported.
func firstBulkError(items []map[string]struct {
	Status int        `json:"status"`
	Error  json.Value `json:"error"`
},
) error {
	for _, entry := range items {
		for action, outcome := range entry {
			// Deleting an id that was never indexed is fine.
			if action == bulkActionDelete && outcome.Status == http.StatusNotFound {
				continue
			}
			if outcome.Status >= 300 {
				return fmt.Errorf("bulk %s failed (status %d): %s", action, outcome.Status, string(outcome.Error))
			}
		}
	}
	return nil
}
// Index writes a single document under id, creating or replacing it.
func (i *Indexer) Index(ctx context.Context, id string, doc any) error {
	payload, err := json.Marshal(doc)
	if err != nil {
		return err
	}
	path := urlPath(i.VersionedIndexName(), "_doc", id)
	return i.doJSON(ctx, http.MethodPut, path, bytes.NewReader(payload), nil)
}
// Delete removes a single document by id. Missing ids are not an error.
func (i *Indexer) Delete(ctx context.Context, id string) error {
	path := urlPath(i.VersionedIndexName(), "_doc", id)
	// 404 is whitelisted: deleting an absent document is idempotent.
	res, err := i.do(ctx, http.MethodDelete, path, "", nil, http.StatusNotFound)
	if err != nil {
		return err
	}
	drainAndClose(res)
	return nil
}
// DeleteByQuery removes every document matching the query.
func (i *Indexer) DeleteByQuery(ctx context.Context, query Query) error {
	payload, err := json.Marshal(map[string]any{"query": query.querySource()})
	if err != nil {
		return err
	}
	path := urlPath(i.VersionedIndexName(), "_delete_by_query")
	return i.doJSON(ctx, http.MethodPost, path, bytes.NewReader(payload), nil)
}
// Refresh forces a refresh so recent writes are searchable immediately
// rather than after the server's periodic refresh.
func (i *Indexer) Refresh(ctx context.Context) error {
	return i.doJSON(ctx, http.MethodPost, urlPath(i.VersionedIndexName(), "_refresh"), nil, nil)
}
// Search runs a search request and decodes the reply.
func (i *Indexer) Search(ctx context.Context, req SearchRequest) (*SearchResponse, error) {
	// Size is always sent (0 is meaningful: stats-only queries); the other
	// clauses are added only when present.
	body := map[string]any{"size": req.Size}
	if req.Query != nil {
		body["query"] = req.Query.querySource()
	}
	if len(req.Sort) > 0 {
		sorts := make([]map[string]any, 0, len(req.Sort))
		for _, s := range req.Sort {
			sorts = append(sorts, s.source())
		}
		body["sort"] = sorts
	}
	if req.From > 0 {
		body["from"] = req.From
	}
	if len(req.Aggregations) > 0 {
		body["aggs"] = req.Aggregations
	}
	if len(req.Highlight) > 0 {
		body["highlight"] = req.Highlight
	}
	payload, err := json.Marshal(body)
	if err != nil {
		return nil, err
	}
	// Default track_total_hits is 10000 (capped count); send it explicitly so
	// callers can choose between exact totals (true) and skipping counting (false).
	path := urlPath(i.VersionedIndexName(), "_search") + "?track_total_hits=" + strconv.FormatBool(req.TrackTotal)
	res, err := i.do(ctx, http.MethodPost, path, "application/json", bytes.NewReader(payload))
	if err != nil {
		return nil, err
	}
	defer drainAndClose(res)
	return decodeSearchResponse(res.Body)
}
// indexExists reports whether the named index is present, using a HEAD probe.
func (i *Indexer) indexExists(ctx context.Context, name string) (bool, error) {
	// 404 means "no such index" and must not surface as an error.
	res, err := i.do(ctx, http.MethodHead, urlPath(name), "", nil, http.StatusNotFound)
	if err != nil {
		return false, err
	}
	drainAndClose(res)
	exists := res.StatusCode == http.StatusOK
	return exists, nil
}
// createIndex creates the versioned index with the configured mapping and
// verifies the cluster acknowledged the creation.
func (i *Indexer) createIndex(ctx context.Context) error {
	name := i.VersionedIndexName()
	var reply struct {
		Acknowledged bool `json:"acknowledged"`
	}
	if err := i.doJSON(ctx, http.MethodPut, urlPath(name), bytes.NewBufferString(i.mapping), &reply); err != nil {
		return fmt.Errorf("create index %s: %w", name, err)
	}
	if !reply.Acknowledged {
		return fmt.Errorf("create index %s not acknowledged", name)
	}
	i.checkOldIndexes(ctx)
	return nil
}
// do sends a request and returns the response. Status >= 300 is turned into
// an error unless the status appears in okStatus. The caller closes Body.
func (i *Indexer) do(ctx context.Context, method, path, contentType string, body io.Reader, okStatus ...int) (*http.Response, error) {
	req, err := http.NewRequestWithContext(ctx, method, i.base+path, body)
	if err != nil {
		return nil, err
	}
	if contentType != "" {
		req.Header.Set("Content-Type", contentType)
	}
	if i.user != "" || i.pass != "" {
		req.SetBasicAuth(i.user, i.pass)
	}
	res, err := i.client.Do(req)
	if err != nil {
		return nil, err
	}
	if res.StatusCode < 300 || slices.Contains(okStatus, res.StatusCode) {
		return res, nil
	}
	// Error status: capture a snippet of the body for the message, then close.
	msg := readErrBody(res)
	res.Body.Close()
	return nil, fmt.Errorf("%s %s: %s", method, path, msg)
}
// doJSON sends a request with a JSON body and, when out is non-nil, decodes
// the JSON response into it.
func (i *Indexer) doJSON(ctx context.Context, method, path string, body io.Reader, out any) error {
	var contentType string
	if body != nil {
		contentType = "application/json"
	}
	res, err := i.do(ctx, method, path, contentType, body)
	if err != nil {
		return err
	}
	defer drainAndClose(res)
	if out != nil {
		return json.NewDecoder(res.Body).Decode(out)
	}
	return nil
}
// drainAndClose discards any unread response body before closing so the
// underlying TCP connection can be reused for keep-alive.
func drainAndClose(res *http.Response) {
_, _ = io.Copy(io.Discard, res.Body)
res.Body.Close()
}
func writeJSONLine(buf *bytes.Buffer, v any) error {
enc, err := json.Marshal(v)
if err != nil {
return err
}
buf.Write(enc)
buf.WriteByte('\n')
return nil
}
// readErrBody reads up to 4 KiB of an error response and drains the rest so
// the underlying connection can be reused (keep-alive needs Body fully read).
func readErrBody(res *http.Response) string {
const limit = 4 << 10
b, _ := io.ReadAll(io.LimitReader(res.Body, limit))
_, _ = io.Copy(io.Discard, res.Body)
return fmt.Sprintf("status %d: %s", res.StatusCode, bytes.TrimSpace(b))
}
// decodeSearchResponse maps a raw _search reply onto SearchResponse,
// flattening the nested hits envelope and any terms-aggregation buckets.
func decodeSearchResponse(r io.Reader) (*SearchResponse, error) {
	var raw struct {
		Hits struct {
			Total struct {
				Value int64 `json:"value"`
			} `json:"total"`
			Hits []struct {
				ID        string              `json:"_id"`
				Score     float64             `json:"_score"`
				Source    json.Value          `json:"_source"`
				Highlight map[string][]string `json:"highlight"`
			} `json:"hits"`
		} `json:"hits"`
		Aggregations map[string]struct {
			Buckets []struct {
				Key      any   `json:"key"`
				DocCount int64 `json:"doc_count"`
			} `json:"buckets"`
		} `json:"aggregations"`
	}
	if err := json.NewDecoder(r).Decode(&raw); err != nil {
		return nil, err
	}
	hits := make([]SearchHit, 0, len(raw.Hits.Hits))
	for _, h := range raw.Hits.Hits {
		hits = append(hits, SearchHit{
			ID:        h.ID,
			Score:     h.Score,
			Source:    h.Source,
			Highlight: h.Highlight,
		})
	}
	resp := &SearchResponse{Total: raw.Hits.Total.Value, Hits: hits}
	if len(raw.Aggregations) > 0 {
		resp.Aggregations = make(map[string][]AggBucket, len(raw.Aggregations))
		for name, agg := range raw.Aggregations {
			buckets := make([]AggBucket, 0, len(agg.Buckets))
			for _, b := range agg.Buckets {
				buckets = append(buckets, AggBucket{Key: b.Key, DocCount: b.DocCount})
			}
			resp.Aggregations[name] = buckets
		}
	}
	return resp, nil
}
// urlPath joins path segments with `/`, percent-escaping each so ids and
// index names containing reserved characters cannot break the request path.
func urlPath(segments ...string) string {
	// strings.Join over pre-escaped segments replaces the hand-rolled
	// bytes.Buffer loop: one allocation for the result, no byte bookkeeping.
	escaped := make([]string, len(segments))
	for idx, s := range segments {
		escaped[idx] = url.PathEscape(s)
	}
	return strings.Join(escaped, "/")
}

View File

@@ -0,0 +1,44 @@
// Copyright 2026 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package elasticsearch
import (
"os"
"strings"
"testing"
"github.com/stretchr/testify/require"
)
// newRealIndexer connects to a live Elasticsearch/OpenSearch server and
// creates a throwaway index named after the running test.
//
// TEST_ELASTICSEARCH_URL overrides the target everywhere (previously it was
// ignored when CI was set); in CI the default is the elasticsearch:9200
// service container, and outside CI the test is skipped when no URL is given.
func newRealIndexer(t *testing.T) *Indexer {
	t.Helper()
	url := os.Getenv("TEST_ELASTICSEARCH_URL")
	if url == "" {
		if os.Getenv("CI") == "" {
			t.Skip("TEST_ELASTICSEARCH_URL not set and not running in CI")
		}
		url = "http://elasticsearch:9200"
	}
	// One index per test: lowercased and slash-free to satisfy index-name rules.
	indexName := "gitea_test_" + strings.ReplaceAll(strings.ToLower(t.Name()), "/", "_")
	ix := NewIndexer(url, indexName, 1, `{"mappings":{"properties":{"x":{"type":"keyword"}}}}`)
	_, err := ix.Init(t.Context())
	require.NoError(t, err)
	t.Cleanup(ix.Close)
	return ix
}
// TestPing verifies a freshly initialized indexer reports a usable cluster.
func TestPing(t *testing.T) {
	indexer := newRealIndexer(t)
	err := indexer.Ping(t.Context())
	require.NoError(t, err)
}
// TestDeleteSwallows404 checks that deleting an id that was never indexed
// succeeds (document deletion is idempotent).
func TestDeleteSwallows404(t *testing.T) {
	indexer := newRealIndexer(t)
	err := indexer.Delete(t.Context(), "missing-id")
	require.NoError(t, err)
}
// TestBulkAcceptsDelete404 checks that a bulk delete of a missing id is not
// reported as an item-level failure.
func TestBulkAcceptsDelete404(t *testing.T) {
	indexer := newRealIndexer(t)
	ops := []BulkOp{DeleteOp("missing-id")}
	err := indexer.Bulk(t.Context(), ops)
	require.NoError(t, err)
}

View File

@@ -0,0 +1,132 @@
// Copyright 2026 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package elasticsearch
// MultiMatch types used by the call sites. See
// https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-multi-match-query.html#multi-match-types
const (
	// MultiMatchTypeBestFields scores a document by its best-matching field.
	MultiMatchTypeBestFields = "best_fields"
	// MultiMatchTypePhrasePrefix matches terms as a phrase with the last term
	// treated as a prefix (used for filename prefix matching).
	MultiMatchTypePhrasePrefix = "phrase_prefix"
)
// ToAnySlice converts []T to []any for variadic query args like TermsQuery.
func ToAnySlice[T any](s []T) []any {
	out := make([]any, 0, len(s))
	for _, v := range s {
		out = append(out, v)
	}
	return out
}
// Query is an Elasticsearch query DSL node. It marshals to the JSON
// object expected by the ES query API.
type Query interface {
querySource() map[string]any
}
type rawQuery map[string]any
func (q rawQuery) querySource() map[string]any { return q }
// TermQuery matches documents whose `field` exactly equals `value`.
func TermQuery(field string, value any) Query {
return rawQuery{"term": map[string]any{field: value}}
}
// TermsQuery matches documents whose `field` equals any of `values`.
func TermsQuery(field string, values ...any) Query {
return rawQuery{"terms": map[string]any{field: values}}
}
// MatchQuery is a full-text match on a single field.
func MatchQuery(field string, value any) Query {
return rawQuery{"match": map[string]any{field: value}}
}
// MatchPhraseQuery matches the exact phrase on `field`.
func MatchPhraseQuery(field, value string) Query {
return rawQuery{"match_phrase": map[string]any{field: value}}
}
// MultiMatchQuery is the fluent builder for a multi_match query.
type MultiMatchQuery struct {
query any
fields []string
typ string
operator string
}
// NewMultiMatchQuery creates a multi_match query over the given fields.
func NewMultiMatchQuery(query any, fields ...string) *MultiMatchQuery {
return &MultiMatchQuery{query: query, fields: fields}
}
func (m *MultiMatchQuery) Type(t string) *MultiMatchQuery { m.typ = t; return m }
func (m *MultiMatchQuery) Operator(op string) *MultiMatchQuery { m.operator = op; return m }
func (m *MultiMatchQuery) querySource() map[string]any {
body := map[string]any{"query": m.query}
if len(m.fields) > 0 {
body["fields"] = m.fields
}
if m.typ != "" {
body["type"] = m.typ
}
if m.operator != "" {
body["operator"] = m.operator
}
return map[string]any{"multi_match": body}
}
// RangeQuery is the fluent builder for a range query.
type RangeQuery struct {
field string
body map[string]any
}
func NewRangeQuery(field string) *RangeQuery {
return &RangeQuery{field: field, body: map[string]any{}}
}
func (r *RangeQuery) Gte(v any) *RangeQuery { r.body["gte"] = v; return r }
func (r *RangeQuery) Lte(v any) *RangeQuery { r.body["lte"] = v; return r }
func (r *RangeQuery) querySource() map[string]any {
return map[string]any{"range": map[string]any{r.field: r.body}}
}
// BoolQuery is the fluent builder for a bool query.
type BoolQuery struct {
	must    []Query
	should  []Query
	mustNot []Query
}

// NewBoolQuery returns an empty bool query.
func NewBoolQuery() *BoolQuery {
	return &BoolQuery{}
}

// Must adds clauses that all have to match.
func (b *BoolQuery) Must(q ...Query) *BoolQuery {
	b.must = append(b.must, q...)
	return b
}

// Should adds clauses of which at least one has to match.
func (b *BoolQuery) Should(q ...Query) *BoolQuery {
	b.should = append(b.should, q...)
	return b
}

// MustNot adds clauses that none of which may match.
func (b *BoolQuery) MustNot(q ...Query) *BoolQuery {
	b.mustNot = append(b.mustNot, q...)
	return b
}

// querySource renders the query as its JSON-ready map form; empty clause
// lists are omitted from the body.
func (b *BoolQuery) querySource() map[string]any {
	body := map[string]any{}
	for key, clauses := range map[string][]Query{
		"must":     b.must,
		"should":   b.should,
		"must_not": b.mustNot,
	} {
		if len(clauses) > 0 {
			body[key] = querySlice(clauses)
		}
	}
	return map[string]any{"bool": body}
}
// querySlice renders each query in the list to its JSON-ready map form.
func querySlice(queries []Query) []map[string]any {
	rendered := make([]map[string]any, 0, len(queries))
	for _, q := range queries {
		rendered = append(rendered, q.querySource())
	}
	return rendered
}

View File

@@ -0,0 +1,76 @@
// Copyright 2026 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package elasticsearch
import "code.gitea.io/gitea/modules/json"
const (
	// bulkActionIndex marks an index (create-or-replace) write.
	bulkActionIndex = "index"
	// bulkActionDelete marks a document deletion.
	bulkActionDelete = "delete"
)

// BulkOp is a single write inside a Bulk call. Construct with IndexOp or DeleteOp.
type BulkOp struct {
	action string // bulkActionIndex or bulkActionDelete
	id     string // document ID
	doc    any    // document body; nil for deletes
}

// IndexOp builds a bulk operation that indexes doc under the given ID.
func IndexOp(id string, doc any) BulkOp {
	return BulkOp{
		action: bulkActionIndex,
		id:     id,
		doc:    doc,
	}
}

// DeleteOp builds a bulk operation that deletes the document with the given ID.
func DeleteOp(id string) BulkOp {
	return BulkOp{
		action: bulkActionDelete,
		id:     id,
	}
}
// SortField is one entry of the search sort array.
type SortField struct {
	Field string // document field to sort on
	Desc  bool   // sort descending when true, ascending otherwise
}

// source renders the sort entry as its JSON-ready map form,
// e.g. {"updated_unix": {"order": "desc"}}.
func (s SortField) source() map[string]any {
	direction := "asc"
	if s.Desc {
		direction = "desc"
	}
	return map[string]any{
		s.Field: map[string]any{"order": direction},
	}
}
// SearchRequest captures everything Gitea sends to the _search endpoint.
// Aggregations and Highlight are raw ES JSON bodies — callers write them as
// map[string]any since each has exactly one call site with a fixed shape.
type SearchRequest struct {
	Query        Query          // the query clause; required
	Sort         []SortField    // sort order, applied in slice order
	From         int            // pagination offset (number of hits to skip)
	Size         int            // page size (max hits to return)
	TrackTotal   bool           // request an exact total hit count (presumably ES "track_total_hits" — confirm in client)
	Aggregations map[string]any // raw aggregations body, or nil
	Highlight    map[string]any // raw highlight body, or nil
}
// SearchHit is a single result row.
type SearchHit struct {
	ID        string              // document ID
	Score     float64             // relevance score assigned by the server
	Source    json.Value          // raw JSON of the matched document, for callers to decode
	Highlight map[string][]string // highlight fragments keyed by field, when requested
}
// AggBucket is a terms-aggregation bucket.
type AggBucket struct {
	Key      any   // the bucket's term value
	DocCount int64 // number of documents in the bucket
}
// SearchResponse is Gitea's decoded view of the search reply.
type SearchResponse struct {
	Total        int64                  // total number of matching documents
	Hits         []SearchHit            // hits for the requested page
	Aggregations map[string][]AggBucket // terms-aggregation buckets keyed by aggregation name
}

View File

@@ -6,14 +6,11 @@ package elasticsearch
import (
"context"
"fmt"
"time"
"code.gitea.io/gitea/modules/log"
"github.com/olivere/elastic/v7"
)
// VersionedIndexName returns the full index name with version
// VersionedIndexName returns the full index name with version suffix.
func (i *Indexer) VersionedIndexName() string {
return versionedIndexName(i.indexName, i.version)
}
@@ -26,41 +23,10 @@ func versionedIndexName(indexName string, version int) string {
return fmt.Sprintf("%s.v%d", indexName, version)
}
func (i *Indexer) createIndex(ctx context.Context) error {
createIndex, err := i.Client.CreateIndex(i.VersionedIndexName()).BodyString(i.mapping).Do(ctx)
if err != nil {
return err
}
if !createIndex.Acknowledged {
return fmt.Errorf("create index %s with %s failed", i.VersionedIndexName(), i.mapping)
}
i.checkOldIndexes(ctx)
return nil
}
func (i *Indexer) initClient() (*elastic.Client, error) {
opts := []elastic.ClientOptionFunc{
elastic.SetURL(i.url),
elastic.SetSniff(false),
elastic.SetHealthcheckInterval(10 * time.Second),
elastic.SetGzip(false),
}
logger := log.GetLogger(log.DEFAULT)
opts = append(opts, elastic.SetTraceLog(&log.PrintfLogger{Logf: logger.Trace}))
opts = append(opts, elastic.SetInfoLog(&log.PrintfLogger{Logf: logger.Info}))
opts = append(opts, elastic.SetErrorLog(&log.PrintfLogger{Logf: logger.Error}))
return elastic.NewClient(opts...)
}
func (i *Indexer) checkOldIndexes(ctx context.Context) {
for v := 0; v < i.version; v++ {
for v := range i.version {
indexName := versionedIndexName(i.indexName, v)
exists, err := i.Client.IndexExists(indexName).Do(ctx)
exists, err := i.indexExists(ctx, indexName)
if err == nil && exists {
log.Warn("Found older elasticsearch index named %q, Gitea will keep the old NOT DELETED. You can delete the old version after the upgrade succeed.", indexName)
}

View File

@@ -11,27 +11,18 @@ import (
"code.gitea.io/gitea/modules/graceful"
"code.gitea.io/gitea/modules/indexer"
indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
inner_elasticsearch "code.gitea.io/gitea/modules/indexer/internal/elasticsearch"
es "code.gitea.io/gitea/modules/indexer/internal/elasticsearch"
"code.gitea.io/gitea/modules/indexer/issues/internal"
"code.gitea.io/gitea/modules/util"
"github.com/olivere/elastic/v7"
)
const (
issueIndexerLatestVersion = 3
// multi-match-types, currently only 2 types are used
// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types
esMultiMatchTypeBestFields = "best_fields"
esMultiMatchTypePhrasePrefix = "phrase_prefix"
)
const issueIndexerLatestVersion = 3
var _ internal.Indexer = &Indexer{}
// Indexer implements Indexer interface
type Indexer struct {
inner *inner_elasticsearch.Indexer
indexer_internal.Indexer // do not composite inner_elasticsearch.Indexer directly to avoid exposing too much
*es.Indexer
}
func (b *Indexer) SupportedSearchModes() []indexer.SearchMode {
@@ -41,12 +32,7 @@ func (b *Indexer) SupportedSearchModes() []indexer.SearchMode {
// NewIndexer creates a new elasticsearch indexer
func NewIndexer(url, indexerName string) *Indexer {
inner := inner_elasticsearch.NewIndexer(url, indexerName, issueIndexerLatestVersion, defaultMapping)
indexer := &Indexer{
inner: inner,
Indexer: inner,
}
return indexer
return &Indexer{Indexer: es.NewIndexer(url, indexerName, issueIndexerLatestVersion, defaultMapping)}
}
const (
@@ -93,29 +79,14 @@ func (b *Indexer) Index(ctx context.Context, issues ...*internal.IndexerData) er
return nil
} else if len(issues) == 1 {
issue := issues[0]
_, err := b.inner.Client.Index().
Index(b.inner.VersionedIndexName()).
Id(strconv.FormatInt(issue.ID, 10)).
BodyJson(issue).
Do(ctx)
return err
return b.Indexer.Index(ctx, strconv.FormatInt(issue.ID, 10), issue)
}
reqs := make([]elastic.BulkableRequest, 0)
ops := make([]es.BulkOp, 0, len(issues))
for _, issue := range issues {
reqs = append(reqs,
elastic.NewBulkIndexRequest().
Index(b.inner.VersionedIndexName()).
Id(strconv.FormatInt(issue.ID, 10)).
Doc(issue),
)
ops = append(ops, es.IndexOp(strconv.FormatInt(issue.ID, 10), issue))
}
_, err := b.inner.Client.Bulk().
Index(b.inner.VersionedIndexName()).
Add(reqs...).
Do(graceful.GetManager().HammerContext())
return err
return b.Bulk(graceful.GetManager().HammerContext(), ops)
}
// Delete deletes indexes by ids
@@ -123,129 +94,116 @@ func (b *Indexer) Delete(ctx context.Context, ids ...int64) error {
if len(ids) == 0 {
return nil
} else if len(ids) == 1 {
_, err := b.inner.Client.Delete().
Index(b.inner.VersionedIndexName()).
Id(strconv.FormatInt(ids[0], 10)).
Do(ctx)
return err
return b.Indexer.Delete(ctx, strconv.FormatInt(ids[0], 10))
}
reqs := make([]elastic.BulkableRequest, 0)
ops := make([]es.BulkOp, 0, len(ids))
for _, id := range ids {
reqs = append(reqs,
elastic.NewBulkDeleteRequest().
Index(b.inner.VersionedIndexName()).
Id(strconv.FormatInt(id, 10)),
)
ops = append(ops, es.DeleteOp(strconv.FormatInt(id, 10)))
}
_, err := b.inner.Client.Bulk().
Index(b.inner.VersionedIndexName()).
Add(reqs...).
Do(graceful.GetManager().HammerContext())
return err
return b.Bulk(graceful.GetManager().HammerContext(), ops)
}
// Search searches for issues by given conditions.
// Returns the matching issue IDs
func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (*internal.SearchResult, error) {
query := elastic.NewBoolQuery()
query := es.NewBoolQuery()
if options.Keyword != "" {
searchMode := util.IfZero(options.SearchMode, b.SupportedSearchModes()[0].ModeValue)
mm := es.NewMultiMatchQuery(options.Keyword, "title", "content", "comments")
if searchMode == indexer.SearchModeExact {
query.Must(elastic.NewMultiMatchQuery(options.Keyword, "title", "content", "comments").Type(esMultiMatchTypePhrasePrefix))
} else /* words */ {
query.Must(elastic.NewMultiMatchQuery(options.Keyword, "title", "content", "comments").Type(esMultiMatchTypeBestFields).Operator("and"))
mm = mm.Type(es.MultiMatchTypePhrasePrefix)
} else {
mm = mm.Type(es.MultiMatchTypeBestFields).Operator("and")
}
query.Must(mm)
}
if len(options.RepoIDs) > 0 {
q := elastic.NewBoolQuery()
q.Should(elastic.NewTermsQuery("repo_id", toAnySlice(options.RepoIDs)...))
q := es.NewBoolQuery()
q.Should(es.TermsQuery("repo_id", es.ToAnySlice(options.RepoIDs)...))
if options.AllPublic {
q.Should(elastic.NewTermQuery("is_public", true))
q.Should(es.TermQuery("is_public", true))
}
query.Must(q)
}
if options.IsPull.Has() {
query.Must(elastic.NewTermQuery("is_pull", options.IsPull.Value()))
query.Must(es.TermQuery("is_pull", options.IsPull.Value()))
}
if options.IsClosed.Has() {
query.Must(elastic.NewTermQuery("is_closed", options.IsClosed.Value()))
query.Must(es.TermQuery("is_closed", options.IsClosed.Value()))
}
if options.IsArchived.Has() {
query.Must(elastic.NewTermQuery("is_archived", options.IsArchived.Value()))
query.Must(es.TermQuery("is_archived", options.IsArchived.Value()))
}
if options.NoLabelOnly {
query.Must(elastic.NewTermQuery("no_label", true))
query.Must(es.TermQuery("no_label", true))
} else {
if len(options.IncludedLabelIDs) > 0 {
q := elastic.NewBoolQuery()
q := es.NewBoolQuery()
for _, labelID := range options.IncludedLabelIDs {
q.Must(elastic.NewTermQuery("label_ids", labelID))
q.Must(es.TermQuery("label_ids", labelID))
}
query.Must(q)
} else if len(options.IncludedAnyLabelIDs) > 0 {
query.Must(elastic.NewTermsQuery("label_ids", toAnySlice(options.IncludedAnyLabelIDs)...))
query.Must(es.TermsQuery("label_ids", es.ToAnySlice(options.IncludedAnyLabelIDs)...))
}
if len(options.ExcludedLabelIDs) > 0 {
q := elastic.NewBoolQuery()
q := es.NewBoolQuery()
for _, labelID := range options.ExcludedLabelIDs {
q.MustNot(elastic.NewTermQuery("label_ids", labelID))
q.MustNot(es.TermQuery("label_ids", labelID))
}
query.Must(q)
}
}
if len(options.MilestoneIDs) > 0 {
query.Must(elastic.NewTermsQuery("milestone_id", toAnySlice(options.MilestoneIDs)...))
query.Must(es.TermsQuery("milestone_id", es.ToAnySlice(options.MilestoneIDs)...))
}
if options.NoProjectOnly {
query.Must(elastic.NewTermQuery("no_project", true))
query.Must(es.TermQuery("no_project", true))
} else if len(options.ProjectIDs) > 0 {
// FIXME: ISSUE-MULTIPLE-PROJECTS-FILTER: this logic is not right, it should use "AND" but not "OR"
query.Must(elastic.NewTermsQuery("project_ids", toAnySlice(options.ProjectIDs)...))
query.Must(es.TermsQuery("project_ids", es.ToAnySlice(options.ProjectIDs)...))
}
if options.PosterID != "" {
// "(none)" becomes 0, it means no poster
posterIDInt64, _ := strconv.ParseInt(options.PosterID, 10, 64)
query.Must(elastic.NewTermQuery("poster_id", posterIDInt64))
query.Must(es.TermQuery("poster_id", posterIDInt64))
}
if options.AssigneeID != "" {
if options.AssigneeID == "(any)" {
q := elastic.NewRangeQuery("assignee_id")
q.Gte(1)
query.Must(q)
query.Must(es.NewRangeQuery("assignee_id").Gte(1))
} else {
// "(none)" becomes 0, it means no assignee
assigneeIDInt64, _ := strconv.ParseInt(options.AssigneeID, 10, 64)
query.Must(elastic.NewTermQuery("assignee_id", assigneeIDInt64))
query.Must(es.TermQuery("assignee_id", assigneeIDInt64))
}
}
if options.MentionID.Has() {
query.Must(elastic.NewTermQuery("mention_ids", options.MentionID.Value()))
query.Must(es.TermQuery("mention_ids", options.MentionID.Value()))
}
if options.ReviewedID.Has() {
query.Must(elastic.NewTermQuery("reviewed_ids", options.ReviewedID.Value()))
query.Must(es.TermQuery("reviewed_ids", options.ReviewedID.Value()))
}
if options.ReviewRequestedID.Has() {
query.Must(elastic.NewTermQuery("review_requested_ids", options.ReviewRequestedID.Value()))
query.Must(es.TermQuery("review_requested_ids", options.ReviewRequestedID.Value()))
}
if options.SubscriberID.Has() {
query.Must(elastic.NewTermQuery("subscriber_ids", options.SubscriberID.Value()))
query.Must(es.TermQuery("subscriber_ids", options.SubscriberID.Value()))
}
if options.UpdatedAfterUnix.Has() || options.UpdatedBeforeUnix.Has() {
q := elastic.NewRangeQuery("updated_unix")
q := es.NewRangeQuery("updated_unix")
if options.UpdatedAfterUnix.Has() {
q.Gte(options.UpdatedAfterUnix.Value())
}
@@ -258,9 +216,9 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (
if options.SortBy == "" {
options.SortBy = internal.SortByCreatedAsc
}
sortBy := []elastic.Sorter{
sortBy := []es.SortField{
parseSortBy(options.SortBy),
elastic.NewFieldSort("id").Desc(),
{Field: "id", Desc: true},
}
// See https://stackoverflow.com/questions/35206409/elasticsearch-2-1-result-window-is-too-large-index-max-result-window/35221900
@@ -268,43 +226,30 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (
const maxPageSize = 10000
skip, limit := indexer_internal.ParsePaginator(options.Paginator, maxPageSize)
searchResult, err := b.inner.Client.Search().
Index(b.inner.VersionedIndexName()).
Query(query).
SortBy(sortBy...).
From(skip).Size(limit).
Do(ctx)
resp, err := b.Indexer.Search(ctx, es.SearchRequest{
Query: query,
Sort: sortBy,
From: skip,
Size: limit,
TrackTotal: true,
})
if err != nil {
return nil, err
}
hits := make([]internal.Match, 0, limit)
for _, hit := range searchResult.Hits.Hits {
id, _ := strconv.ParseInt(hit.Id, 10, 64)
hits = append(hits, internal.Match{
ID: id,
})
hits := make([]internal.Match, 0, len(resp.Hits))
for _, hit := range resp.Hits {
id, _ := strconv.ParseInt(hit.ID, 10, 64)
hits = append(hits, internal.Match{ID: id})
}
return &internal.SearchResult{
Total: searchResult.TotalHits(),
Total: resp.Total,
Hits: hits,
}, nil
}
func toAnySlice[T any](s []T) []any {
ret := make([]any, 0, len(s))
for _, item := range s {
ret = append(ret, item)
}
return ret
}
func parseSortBy(sortBy internal.SortBy) elastic.Sorter {
field := strings.TrimPrefix(string(sortBy), "-")
ret := elastic.NewFieldSort(field)
if strings.HasPrefix(string(sortBy), "-") {
ret.Desc()
}
return ret
func parseSortBy(sortBy internal.SortBy) es.SortField {
field, desc := strings.CutPrefix(string(sortBy), "-")
return es.SortField{Field: field, Desc: desc}
}

View File

@@ -6,6 +6,7 @@ package elasticsearch
import (
"fmt"
"net/http"
"net/url"
"os"
"testing"
"time"
@@ -17,19 +18,36 @@ import (
func TestElasticsearchIndexer(t *testing.T) {
// The elasticsearch instance started by pull-db-tests.yml > test-unit > services > elasticsearch
url := "http://elastic:changeme@elasticsearch:9200"
rawURL := "http://elastic:changeme@elasticsearch:9200"
if os.Getenv("CI") == "" {
// Make it possible to run tests against a local elasticsearch instance
url = os.Getenv("TEST_ELASTICSEARCH_URL")
if url == "" {
rawURL = os.Getenv("TEST_ELASTICSEARCH_URL")
if rawURL == "" {
t.Skip("TEST_ELASTICSEARCH_URL not set and not running in CI")
return
}
}
// Go's net/http does not auto-attach URL userinfo as Basic Auth, so extract
// it and set the header explicitly; otherwise auth-enforced clusters answer
// 401 and the probe never reports ready.
parsed, err := url.Parse(rawURL)
require.NoError(t, err)
user := parsed.User
parsed.User = nil
probeURL := parsed.String()
require.Eventually(t, func() bool {
resp, err := http.Get(url)
req, err := http.NewRequest(http.MethodGet, probeURL, nil)
if err != nil {
return false
}
if user != nil {
pass, _ := user.Password()
req.SetBasicAuth(user.Username(), pass)
}
resp, err := http.DefaultClient.Do(req)
if err != nil {
return false
}
@@ -37,7 +55,7 @@ func TestElasticsearchIndexer(t *testing.T) {
return resp.StatusCode == http.StatusOK
}, time.Minute, time.Second, "Expected elasticsearch to be up")
indexer := NewIndexer(url, fmt.Sprintf("test_elasticsearch_indexer_%d", time.Now().Unix()))
indexer := NewIndexer(rawURL, fmt.Sprintf("test_elasticsearch_indexer_%d", time.Now().Unix()))
defer indexer.Close()
tests.TestIndexer(t, indexer)

View File

@@ -116,6 +116,16 @@ var cases = []*testIndexerCase{
assert.Equal(t, len(data), int(result.Total))
},
},
{
// Exercises the single-doc Index/Delete fast path in backends that have one (e.g. Elasticsearch).
Name: "single-doc index",
ExtraData: []*internal.IndexerData{
{ID: 999, Title: "solo-issue-marker"},
},
SearchOptions: &internal.SearchOptions{Keyword: "solo-issue-marker"},
ExpectedIDs: []int64{999},
ExpectedTotal: 1,
},
{
Name: "Keyword",
ExtraData: []*internal.IndexerData{