mirror of
				https://github.com/go-gitea/gitea.git
				synced 2025-11-03 08:02:36 +09:00 
			
		
		
		
	This PR improves the accuracy of Gitea's code search. 
Currently, Gitea does not consider statements such as
`console.log("hello")` as hits when the user searches for `log`. The
culprit is how both ES and Bleve are tokenizing the file contents (in
both cases, `console.log` is a whole token).
In ES' case, we changed the tokenizer to
[simple_pattern_split](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-simplepatternsplit-tokenizer.html#:~:text=The%20simple_pattern_split%20tokenizer%20uses%20a,the%20tokenization%20is%20generally%20faster.).
In such a case, tokens are words formed by digits and letters. In
Bleve's case, it employs a
[letter](https://blevesearch.com/docs/Tokenizers/) tokenizer.
Resolves #32220
---------
Signed-off-by: Bruno Sofiato <bruno.sofiato@gmail.com>
		
	
		
			
				
	
	
		
			54 lines
		
	
	
		
			976 B
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			54 lines
		
	
	
		
			976 B
		
	
	
	
		
			Go
		
	
	
	
	
	
// Copyright 2024 The Gitea Authors. All rights reserved.
 | 
						|
// SPDX-License-Identifier: MIT
 | 
						|
 | 
						|
package bleve
 | 
						|
 | 
						|
import (
 | 
						|
	"fmt"
 | 
						|
	"testing"
 | 
						|
 | 
						|
	"github.com/stretchr/testify/assert"
 | 
						|
)
 | 
						|
 | 
						|
func TestBleveGuessFuzzinessByKeyword(t *testing.T) {
 | 
						|
	scenarios := []struct {
 | 
						|
		Input     string
 | 
						|
		Fuzziness int // See util.go for the definition of fuzziness in this particular context
 | 
						|
	}{
 | 
						|
		{
 | 
						|
			Input:     "",
 | 
						|
			Fuzziness: 0,
 | 
						|
		},
 | 
						|
		{
 | 
						|
			Input:     "Avocado",
 | 
						|
			Fuzziness: 1,
 | 
						|
		},
 | 
						|
		{
 | 
						|
			Input:     "Geschwindigkeit",
 | 
						|
			Fuzziness: 2,
 | 
						|
		},
 | 
						|
		{
 | 
						|
			Input:     "non-exist",
 | 
						|
			Fuzziness: 0,
 | 
						|
		},
 | 
						|
		{
 | 
						|
			Input:     "갃갃갃",
 | 
						|
			Fuzziness: 0,
 | 
						|
		},
 | 
						|
		{
 | 
						|
			Input:     "repo1",
 | 
						|
			Fuzziness: 0,
 | 
						|
		},
 | 
						|
		{
 | 
						|
			Input:     "avocado.md",
 | 
						|
			Fuzziness: 0,
 | 
						|
		},
 | 
						|
	}
 | 
						|
 | 
						|
	for _, scenario := range scenarios {
 | 
						|
		t.Run(fmt.Sprintf("ensure fuzziness of '%s' is '%d'", scenario.Input, scenario.Fuzziness), func(t *testing.T) {
 | 
						|
			assert.Equal(t, scenario.Fuzziness, GuessFuzzinessByKeyword(scenario.Input))
 | 
						|
		})
 | 
						|
	}
 | 
						|
}
 |