mirror of
				https://github.com/go-gitea/gitea.git
				synced 2025-10-29 10:57:44 +09:00 
			
		
		
		
	
		
			
				
	
	
		
			481 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			481 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| package enry
 | |
| 
 | |
| import (
 | |
| 	"bufio"
 | |
| 	"bytes"
 | |
| 	"path/filepath"
 | |
| 	"strings"
 | |
| 
 | |
| 	"github.com/go-enry/go-enry/v2/data"
 | |
| 	"github.com/go-enry/go-enry/v2/regex"
 | |
| )
 | |
| 
 | |
| // OtherLanguage is used as a zero value when a function can not return a specific language.
 | |
| const OtherLanguage = ""
 | |
| 
 | |
| // Strategy type fix the signature for the functions that can be used as a strategy.
 | |
| type Strategy func(filename string, content []byte, candidates []string) (languages []string)
 | |
| 
 | |
| // DefaultStrategies is a sequence of strategies used by GetLanguage to detect languages.
 | |
| var DefaultStrategies = []Strategy{
 | |
| 	GetLanguagesByModeline,
 | |
| 	GetLanguagesByFilename,
 | |
| 	GetLanguagesByShebang,
 | |
| 	GetLanguagesByExtension,
 | |
| 	GetLanguagesByContent,
 | |
| 	GetLanguagesByClassifier,
 | |
| }
 | |
| 
 | |
| // defaultClassifier is a Naive Bayes classifier trained on Linguist samples.
 | |
| var defaultClassifier classifier = &naiveBayes{
 | |
| 	languagesLogProbabilities: data.LanguagesLogProbabilities,
 | |
| 	tokensLogProbabilities:    data.TokensLogProbabilities,
 | |
| 	tokensTotal:               data.TokensTotal,
 | |
| }
 | |
| 
 | |
| // GetLanguage applies a sequence of strategies based on the given filename and content
 | |
| // to find out the most probably language to return.
 | |
| func GetLanguage(filename string, content []byte) (language string) {
 | |
| 	languages := GetLanguages(filename, content)
 | |
| 	return firstLanguage(languages)
 | |
| }
 | |
| 
 | |
| func firstLanguage(languages []string) string {
 | |
| 	for _, l := range languages {
 | |
| 		if l != "" {
 | |
| 			return l
 | |
| 		}
 | |
| 	}
 | |
| 	return OtherLanguage
 | |
| }
 | |
| 
 | |
| // GetLanguageByModeline returns detected language. If there are more than one possibles languages
 | |
| // it returns the first language by alphabetically order and safe to false.
 | |
| func GetLanguageByModeline(content []byte) (language string, safe bool) {
 | |
| 	return getLanguageByStrategy(GetLanguagesByModeline, "", content, nil)
 | |
| }
 | |
| 
 | |
| // GetLanguageByEmacsModeline returns detected language. If there are more than one possibles languages
 | |
| // it returns the first language by alphabetically order and safe to false.
 | |
| func GetLanguageByEmacsModeline(content []byte) (language string, safe bool) {
 | |
| 	return getLanguageByStrategy(GetLanguagesByEmacsModeline, "", content, nil)
 | |
| }
 | |
| 
 | |
| // GetLanguageByVimModeline returns detected language. If there are more than one possibles languages
 | |
| // it returns the first language by alphabetically order and safe to false.
 | |
| func GetLanguageByVimModeline(content []byte) (language string, safe bool) {
 | |
| 	return getLanguageByStrategy(GetLanguagesByVimModeline, "", content, nil)
 | |
| }
 | |
| 
 | |
| // GetLanguageByFilename returns detected language. If there are more than one possibles languages
 | |
| // it returns the first language by alphabetically order and safe to false.
 | |
| func GetLanguageByFilename(filename string) (language string, safe bool) {
 | |
| 	return getLanguageByStrategy(GetLanguagesByFilename, filename, nil, nil)
 | |
| }
 | |
| 
 | |
| // GetLanguageByShebang returns detected language. If there are more than one possibles languages
 | |
| // it returns the first language by alphabetically order and safe to false.
 | |
| func GetLanguageByShebang(content []byte) (language string, safe bool) {
 | |
| 	return getLanguageByStrategy(GetLanguagesByShebang, "", content, nil)
 | |
| }
 | |
| 
 | |
| // GetLanguageByExtension returns detected language. If there are more than one possibles languages
 | |
| // it returns the first language by alphabetically order and safe to false.
 | |
| func GetLanguageByExtension(filename string) (language string, safe bool) {
 | |
| 	return getLanguageByStrategy(GetLanguagesByExtension, filename, nil, nil)
 | |
| }
 | |
| 
 | |
| // GetLanguageByContent returns detected language. If there are more than one possibles languages
 | |
| // it returns the first language by alphabetically order and safe to false.
 | |
| func GetLanguageByContent(filename string, content []byte) (language string, safe bool) {
 | |
| 	return getLanguageByStrategy(GetLanguagesByContent, filename, content, nil)
 | |
| }
 | |
| 
 | |
| // GetLanguageByClassifier returns the most probably language detected for the given content. It uses
 | |
| // defaultClassifier, if no candidates are provided it returns OtherLanguage.
 | |
| func GetLanguageByClassifier(content []byte, candidates []string) (language string, safe bool) {
 | |
| 	return getLanguageByStrategy(GetLanguagesByClassifier, "", content, candidates)
 | |
| }
 | |
| 
 | |
| func getLanguageByStrategy(strategy Strategy, filename string, content []byte, candidates []string) (string, bool) {
 | |
| 	languages := strategy(filename, content, candidates)
 | |
| 	return getFirstLanguageAndSafe(languages)
 | |
| }
 | |
| 
 | |
| func getFirstLanguageAndSafe(languages []string) (language string, safe bool) {
 | |
| 	language = firstLanguage(languages)
 | |
| 	safe = len(languages) == 1
 | |
| 	return
 | |
| }
 | |
| 
 | |
| // getLanguageBySpecificClassifier returns the most probably language for the given content using
 | |
| // classifier to detect language.
 | |
| func getLanguageBySpecificClassifier(content []byte, candidates []string, classifier classifier) (language string, safe bool) {
 | |
| 	languages := getLanguagesBySpecificClassifier(content, candidates, classifier)
 | |
| 	return getFirstLanguageAndSafe(languages)
 | |
| }
 | |
| 
 | |
| // GetLanguages applies a sequence of strategies based on the given filename and content
 | |
| // to find out the most probably languages to return.
 | |
| // At least one of arguments should be set. If content is missing, language detection will be based on the filename.
 | |
| // The function won't read the file, given an empty content.
 | |
| func GetLanguages(filename string, content []byte) []string {
 | |
| 	if IsBinary(content) {
 | |
| 		return nil
 | |
| 	}
 | |
| 
 | |
| 	var languages []string
 | |
| 	candidates := []string{}
 | |
| 	for _, strategy := range DefaultStrategies {
 | |
| 		languages = strategy(filename, content, candidates)
 | |
| 		if len(languages) == 1 {
 | |
| 			return languages
 | |
| 		}
 | |
| 
 | |
| 		if len(languages) > 0 {
 | |
| 			candidates = append(candidates, languages...)
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return languages
 | |
| }
 | |
| 
 | |
| // GetLanguagesByModeline returns a slice of possible languages for the given content.
 | |
| // It complies with the signature to be a Strategy type.
 | |
| func GetLanguagesByModeline(_ string, content []byte, candidates []string) []string {
 | |
| 	headFoot := getHeaderAndFooter(content)
 | |
| 	var languages []string
 | |
| 	for _, getLang := range modelinesFunc {
 | |
| 		languages = getLang("", headFoot, candidates)
 | |
| 		if len(languages) > 0 {
 | |
| 			break
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return languages
 | |
| }
 | |
| 
 | |
| var modelinesFunc = []Strategy{
 | |
| 	GetLanguagesByEmacsModeline,
 | |
| 	GetLanguagesByVimModeline,
 | |
| }
 | |
| 
 | |
| func getHeaderAndFooter(content []byte) []byte {
 | |
| 	const searchScope = 5
 | |
| 
 | |
| 	if len(content) == 0 {
 | |
| 		return content
 | |
| 	}
 | |
| 
 | |
| 	if bytes.Count(content, []byte("\n")) < 2*searchScope {
 | |
| 		return content
 | |
| 	}
 | |
| 
 | |
| 	header := headScope(content, searchScope)
 | |
| 	footer := footScope(content, searchScope)
 | |
| 	headerAndFooter := make([]byte, 0, len(content[:header])+len(content[footer:]))
 | |
| 	headerAndFooter = append(headerAndFooter, content[:header]...)
 | |
| 	headerAndFooter = append(headerAndFooter, content[footer:]...)
 | |
| 	return headerAndFooter
 | |
| }
 | |
| 
 | |
| func headScope(content []byte, scope int) (index int) {
 | |
| 	for i := 0; i < scope; i++ {
 | |
| 		eol := bytes.IndexAny(content, "\n")
 | |
| 		content = content[eol+1:]
 | |
| 		index += eol
 | |
| 	}
 | |
| 
 | |
| 	return index + scope - 1
 | |
| }
 | |
| 
 | |
| func footScope(content []byte, scope int) (index int) {
 | |
| 	for i := 0; i < scope; i++ {
 | |
| 		index = bytes.LastIndexAny(content, "\n")
 | |
| 		content = content[:index]
 | |
| 	}
 | |
| 
 | |
| 	return index + 1
 | |
| }
 | |
| 
 | |
| var (
 | |
| 	reEmacsModeline = regex.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`)
 | |
| 	reEmacsLang     = regex.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`)
 | |
| 	reVimModeline   = regex.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`)
 | |
| 	reVimLang       = regex.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`)
 | |
| )
 | |
| 
 | |
| // GetLanguagesByEmacsModeline returns a slice of possible languages for the given content.
 | |
| // It complies with the signature to be a Strategy type.
 | |
| func GetLanguagesByEmacsModeline(_ string, content []byte, _ []string) []string {
 | |
| 	matched := reEmacsModeline.FindAllSubmatch(content, -1)
 | |
| 	if matched == nil {
 | |
| 		return nil
 | |
| 	}
 | |
| 
 | |
| 	// only take the last matched line, discard previous lines
 | |
| 	lastLineMatched := matched[len(matched)-1][1]
 | |
| 	matchedAlias := reEmacsLang.FindSubmatch(lastLineMatched)
 | |
| 	var alias string
 | |
| 	if matchedAlias != nil {
 | |
| 		alias = string(matchedAlias[1])
 | |
| 	} else {
 | |
| 		alias = string(lastLineMatched)
 | |
| 	}
 | |
| 
 | |
| 	language, ok := GetLanguageByAlias(alias)
 | |
| 	if !ok {
 | |
| 		return nil
 | |
| 	}
 | |
| 
 | |
| 	return []string{language}
 | |
| }
 | |
| 
 | |
| // GetLanguagesByVimModeline returns a slice of possible languages for the given content.
 | |
| // It complies with the signature to be a Strategy type.
 | |
| func GetLanguagesByVimModeline(_ string, content []byte, _ []string) []string {
 | |
| 	matched := reVimModeline.FindAllSubmatch(content, -1)
 | |
| 	if matched == nil {
 | |
| 		return nil
 | |
| 	}
 | |
| 
 | |
| 	// only take the last matched line, discard previous lines
 | |
| 	lastLineMatched := matched[len(matched)-1][1]
 | |
| 	matchedAlias := reVimLang.FindAllSubmatch(lastLineMatched, -1)
 | |
| 	if matchedAlias == nil {
 | |
| 		return nil
 | |
| 	}
 | |
| 
 | |
| 	alias := string(matchedAlias[0][1])
 | |
| 	if len(matchedAlias) > 1 {
 | |
| 		// cases:
 | |
| 		// matchedAlias = [["syntax=ruby " "ruby"] ["ft=python " "python"] ["filetype=perl " "perl"]] returns OtherLanguage;
 | |
| 		// matchedAlias = [["syntax=python " "python"] ["ft=python " "python"] ["filetype=python " "python"]] returns "Python";
 | |
| 		for _, match := range matchedAlias {
 | |
| 			otherAlias := string(match[1])
 | |
| 			if otherAlias != alias {
 | |
| 				return nil
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	language, ok := GetLanguageByAlias(alias)
 | |
| 	if !ok {
 | |
| 		return nil
 | |
| 	}
 | |
| 
 | |
| 	return []string{language}
 | |
| }
 | |
| 
 | |
| // GetLanguagesByFilename returns a slice of possible languages for the given filename.
 | |
| // It complies with the signature to be a Strategy type.
 | |
| func GetLanguagesByFilename(filename string, _ []byte, _ []string) []string {
 | |
| 	if filename == "" {
 | |
| 		return nil
 | |
| 	}
 | |
| 
 | |
| 	return data.LanguagesByFilename[filepath.Base(filename)]
 | |
| }
 | |
| 
 | |
| // GetLanguagesByShebang returns a slice of possible languages for the given content.
 | |
| // It complies with the signature to be a Strategy type.
 | |
| func GetLanguagesByShebang(_ string, content []byte, _ []string) (languages []string) {
 | |
| 	interpreter := getInterpreter(content)
 | |
| 	return data.LanguagesByInterpreter[interpreter]
 | |
| }
 | |
| 
 | |
| var (
 | |
| 	shebangExecHack = regex.MustCompile(`exec (\w+).+\$0.+\$@`)
 | |
| 	pythonVersion   = regex.MustCompile(`python\d\.\d+`)
 | |
| )
 | |
| 
 | |
| func getInterpreter(data []byte) (interpreter string) {
 | |
| 	line := getFirstLine(data)
 | |
| 	if !hasShebang(line) {
 | |
| 		return ""
 | |
| 	}
 | |
| 
 | |
| 	// skip shebang
 | |
| 	line = bytes.TrimSpace(line[2:])
 | |
| 	splitted := bytes.Fields(line)
 | |
| 	if len(splitted) == 0 {
 | |
| 		return ""
 | |
| 	}
 | |
| 
 | |
| 	if bytes.Contains(splitted[0], []byte("env")) {
 | |
| 		if len(splitted) > 1 {
 | |
| 			interpreter = string(splitted[1])
 | |
| 		}
 | |
| 	} else {
 | |
| 		splittedPath := bytes.Split(splitted[0], []byte{'/'})
 | |
| 		interpreter = string(splittedPath[len(splittedPath)-1])
 | |
| 	}
 | |
| 
 | |
| 	if interpreter == "sh" {
 | |
| 		interpreter = lookForMultilineExec(data)
 | |
| 	}
 | |
| 
 | |
| 	if pythonVersion.MatchString(interpreter) {
 | |
| 		interpreter = interpreter[:strings.Index(interpreter, `.`)]
 | |
| 	}
 | |
| 
 | |
| 	// If osascript is called with argument -l it could be different language so do not relay on it
 | |
| 	// To match linguist behaviour, see ref https://github.com/github/linguist/blob/d95bae794576ab0ef2fcb41a39eb61ea5302c5b5/lib/linguist/shebang.rb#L63
 | |
| 	if interpreter == "osascript" && bytes.Contains(line, []byte("-l")) {
 | |
| 		interpreter = ""
 | |
| 	}
 | |
| 
 | |
| 	return
 | |
| }
 | |
| 
 | |
| func getFirstLine(content []byte) []byte {
 | |
| 	nlpos := bytes.IndexByte(content, '\n')
 | |
| 	if nlpos < 0 {
 | |
| 		return content
 | |
| 	}
 | |
| 
 | |
| 	return content[:nlpos]
 | |
| }
 | |
| 
 | |
| func hasShebang(line []byte) bool {
 | |
| 	const shebang = `#!`
 | |
| 	prefix := []byte(shebang)
 | |
| 	return bytes.HasPrefix(line, prefix)
 | |
| }
 | |
| 
 | |
| func lookForMultilineExec(data []byte) string {
 | |
| 	const magicNumOfLines = 5
 | |
| 	interpreter := "sh"
 | |
| 
 | |
| 	buf := bufio.NewScanner(bytes.NewReader(data))
 | |
| 	for i := 0; i < magicNumOfLines && buf.Scan(); i++ {
 | |
| 		line := buf.Bytes()
 | |
| 		if shebangExecHack.Match(line) {
 | |
| 			interpreter = shebangExecHack.FindStringSubmatch(string(line))[1]
 | |
| 			break
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	if err := buf.Err(); err != nil {
 | |
| 		return interpreter
 | |
| 	}
 | |
| 
 | |
| 	return interpreter
 | |
| }
 | |
| 
 | |
| // GetLanguagesByExtension returns a slice of possible languages for the given filename.
 | |
| // It complies with the signature to be a Strategy type.
 | |
| func GetLanguagesByExtension(filename string, _ []byte, _ []string) []string {
 | |
| 	if !strings.Contains(filename, ".") {
 | |
| 		return nil
 | |
| 	}
 | |
| 
 | |
| 	filename = strings.ToLower(filename)
 | |
| 	dots := getDotIndexes(filename)
 | |
| 	for _, dot := range dots {
 | |
| 		ext := filename[dot:]
 | |
| 		languages, ok := data.LanguagesByExtension[ext]
 | |
| 		if ok {
 | |
| 			return languages
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| func getDotIndexes(filename string) []int {
 | |
| 	dots := make([]int, 0, 2)
 | |
| 	for i, letter := range filename {
 | |
| 		if letter == rune('.') {
 | |
| 			dots = append(dots, i)
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return dots
 | |
| }
 | |
| 
 | |
| // GetLanguagesByContent returns a slice of languages for the given content.
 | |
| // It is a Strategy that uses content-based regexp heuristics and a filename extension.
 | |
| func GetLanguagesByContent(filename string, content []byte, _ []string) []string {
 | |
| 	if filename == "" {
 | |
| 		return nil
 | |
| 	}
 | |
| 
 | |
| 	ext := strings.ToLower(filepath.Ext(filename))
 | |
| 
 | |
| 	heuristic, ok := data.ContentHeuristics[ext]
 | |
| 	if !ok {
 | |
| 		return nil
 | |
| 	}
 | |
| 
 | |
| 	return heuristic.Match(content)
 | |
| }
 | |
| 
 | |
| // GetLanguagesByClassifier returns a sorted slice of possible languages ordered by
 | |
| // decreasing language's probability. If there are not candidates it returns nil.
 | |
| // It is a Strategy that uses a pre-trained defaultClassifier.
 | |
| func GetLanguagesByClassifier(filename string, content []byte, candidates []string) (languages []string) {
 | |
| 	if len(candidates) == 0 {
 | |
| 		return nil
 | |
| 	}
 | |
| 
 | |
| 	return getLanguagesBySpecificClassifier(content, candidates, defaultClassifier)
 | |
| }
 | |
| 
 | |
| // getLanguagesBySpecificClassifier returns a slice of possible languages. It takes in a Classifier to be used.
 | |
| func getLanguagesBySpecificClassifier(content []byte, candidates []string, classifier classifier) (languages []string) {
 | |
| 	mapCandidates := make(map[string]float64)
 | |
| 	for _, candidate := range candidates {
 | |
| 		mapCandidates[candidate]++
 | |
| 	}
 | |
| 
 | |
| 	return classifier.classify(content, mapCandidates)
 | |
| }
 | |
| 
 | |
| // GetLanguageExtensions returns all extensions associated with the given language.
 | |
| func GetLanguageExtensions(language string) []string {
 | |
| 	return data.ExtensionsByLanguage[language]
 | |
| }
 | |
| 
 | |
| // Type represent language's type. Either data, programming, markup, prose, or unknown.
 | |
| type Type int
 | |
| 
 | |
| // Type's values.
 | |
| const (
 | |
| 	Unknown Type = iota
 | |
| 	Data
 | |
| 	Programming
 | |
| 	Markup
 | |
| 	Prose
 | |
| )
 | |
| 
 | |
| // GetLanguageType returns the type of the given language.
 | |
| func GetLanguageType(language string) (langType Type) {
 | |
| 	intType, ok := data.LanguagesType[language]
 | |
| 	langType = Type(intType)
 | |
| 	if !ok {
 | |
| 		langType = Unknown
 | |
| 	}
 | |
| 	return langType
 | |
| }
 | |
| 
 | |
| // GetLanguageByAlias returns either the language related to the given alias and ok set to true
 | |
| // or Otherlanguage and ok set to false if the alias is not recognized.
 | |
| func GetLanguageByAlias(alias string) (lang string, ok bool) {
 | |
| 	lang, ok = data.LanguageByAlias(alias)
 | |
| 	if !ok {
 | |
| 		lang = OtherLanguage
 | |
| 	}
 | |
| 
 | |
| 	return
 | |
| }
 | |
| 
 | |
| // GetLanguageGroup returns language group or empty string if language does not have group.
 | |
| func GetLanguageGroup(language string) string {
 | |
| 	if group, ok := data.LanguagesGroup[language]; ok {
 | |
| 		return group
 | |
| 	}
 | |
| 
 | |
| 	return ""
 | |
| }
 |