mirror of
				https://github.com/go-gitea/gitea.git
				synced 2025-10-31 21:28:11 +09:00 
			
		
		
		
	Use cat-file --batch in GetLanguageStats (#14685)
* Use cat-file --batch in GetLanguageStats This PR moves to using a single cat-file --batch in GetLanguageStats significantly reducing the number of processes spawned during language stat processing. Signed-off-by: Andrew Thornton <art27@cantab.net> * placate lint Signed-off-by: Andrew Thornton <art27@cantab.net> * Update modules/git/repo_language_stats_nogogit.go Co-authored-by: a1012112796 <1012112796@qq.com> Co-authored-by: Lauris BH <lauris@nix.lv> Co-authored-by: 6543 <6543@obermui.de> Co-authored-by: a1012112796 <1012112796@qq.com> Co-authored-by: Lunny Xiao <xiaolunwen@gmail.com>
This commit is contained in:
		| @@ -7,9 +7,11 @@ | ||||
| package git | ||||
|  | ||||
| import ( | ||||
| 	"bufio" | ||||
| 	"bytes" | ||||
| 	"io" | ||||
| 	"io/ioutil" | ||||
| 	"math" | ||||
| 	"strings" | ||||
|  | ||||
| 	"code.gitea.io/gitea/modules/analyze" | ||||
|  | ||||
| @@ -18,16 +20,60 @@ import ( | ||||
|  | ||||
| // GetLanguageStats calculates language stats for git repository at specified commit | ||||
| func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, error) { | ||||
| 	// FIXME: We can be more efficient here... | ||||
| 	// | ||||
| 	// We're expecting that we will be reading a lot of blobs and the trees | ||||
| 	// Thus we should use a shared `cat-file --batch` to get all of this data | ||||
| 	// And keep the buffers around with resets as necessary. | ||||
| 	// | ||||
| 	// It's more complicated so... | ||||
| 	commit, err := repo.GetCommit(commitID) | ||||
| 	// We will feed the commit ID into cat-file --batch first, followed by blob IDs as necessary, | ||||
| 	// so let's create a batch stdin and stdout. | ||||
|  | ||||
| 	batchStdinReader, batchStdinWriter := io.Pipe() | ||||
| 	batchStdoutReader, batchStdoutWriter := io.Pipe() | ||||
| 	defer func() { | ||||
| 		_ = batchStdinReader.Close() | ||||
| 		_ = batchStdinWriter.Close() | ||||
| 		_ = batchStdoutReader.Close() | ||||
| 		_ = batchStdoutWriter.Close() | ||||
| 	}() | ||||
|  | ||||
| 	go func() { | ||||
| 		stderr := strings.Builder{} | ||||
| 		err := NewCommand("cat-file", "--batch").RunInDirFullPipeline(repo.Path, batchStdoutWriter, &stderr, batchStdinReader) | ||||
| 		if err != nil { | ||||
| 			_ = batchStdoutWriter.CloseWithError(ConcatenateError(err, (&stderr).String())) | ||||
| 			_ = batchStdinReader.CloseWithError(ConcatenateError(err, (&stderr).String())) | ||||
| 		} else { | ||||
| 			_ = batchStdoutWriter.Close() | ||||
| 			_ = batchStdinReader.Close() | ||||
| 		} | ||||
| 	}() | ||||
|  | ||||
| 	// For simplicity's sake we'll use a buffered reader | ||||
| 	batchReader := bufio.NewReader(batchStdoutReader) | ||||
|  | ||||
| 	writeID := func(id string) error { | ||||
| 		_, err := batchStdinWriter.Write([]byte(id)) | ||||
| 		if err != nil { | ||||
| 			return err | ||||
| 		} | ||||
| 		_, err = batchStdinWriter.Write([]byte{'\n'}) | ||||
| 		return err | ||||
| 	} | ||||
|  | ||||
| 	if err := writeID(commitID); err != nil { | ||||
| 		return nil, err | ||||
| 	} | ||||
| 	shaBytes, typ, size, err := ReadBatchLine(batchReader) | ||||
| 	if typ != "commit" { | ||||
| 		log("Unable to get commit for: %s. Err: %v", commitID, err) | ||||
| 		return nil, ErrNotExist{commitID, ""} | ||||
| 	} | ||||
|  | ||||
| 	sha, err := NewIDFromString(string(shaBytes)) | ||||
| 	if err != nil { | ||||
| 		log("Unable to get commit for: %s", commitID) | ||||
| 		log("Unable to get commit for: %s. Err: %v", commitID, err) | ||||
| 		return nil, ErrNotExist{commitID, ""} | ||||
| 	} | ||||
|  | ||||
| 	commit, err := CommitFromReader(repo, sha, io.LimitReader(batchReader, size)) | ||||
| 	if err != nil { | ||||
| 		log("Unable to get commit for: %s. Err: %v", commitID, err) | ||||
| 		return nil, err | ||||
| 	} | ||||
|  | ||||
| @@ -38,17 +84,45 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err | ||||
| 		return nil, err | ||||
| 	} | ||||
|  | ||||
| 	contentBuf := bytes.Buffer{} | ||||
| 	var content []byte | ||||
| 	sizes := make(map[string]int64) | ||||
| 	for _, f := range entries { | ||||
| 		contentBuf.Reset() | ||||
| 		content = contentBuf.Bytes() | ||||
| 		if f.Size() == 0 || enry.IsVendor(f.Name()) || enry.IsDotFile(f.Name()) || | ||||
| 			enry.IsDocumentation(f.Name()) || enry.IsConfiguration(f.Name()) { | ||||
| 			continue | ||||
| 		} | ||||
|  | ||||
| 		// If content can not be read or file is too big just do detection by filename | ||||
| 		var content []byte | ||||
|  | ||||
| 		if f.Size() <= bigFileSize { | ||||
| 			content, _ = readFile(f, fileSizeLimit) | ||||
| 			if err := writeID(f.ID.String()); err != nil { | ||||
| 				return nil, err | ||||
| 			} | ||||
| 			_, _, size, err := ReadBatchLine(batchReader) | ||||
| 			if err != nil { | ||||
| 				log("Error reading blob: %s Err: %v", f.ID.String(), err) | ||||
| 				return nil, err | ||||
| 			} | ||||
|  | ||||
| 			sizeToRead := size | ||||
| 			discard := int64(0) | ||||
| 			if size > fileSizeLimit { | ||||
| 				sizeToRead = fileSizeLimit | ||||
| 				discard = size - fileSizeLimit | ||||
| 			} | ||||
|  | ||||
| 			_, err = contentBuf.ReadFrom(io.LimitReader(batchReader, sizeToRead)) | ||||
| 			if err != nil { | ||||
| 				return nil, err | ||||
| 			} | ||||
| 			content = contentBuf.Bytes() | ||||
| 			err = discardFull(batchReader, discard) | ||||
| 			if err != nil { | ||||
| 				return nil, err | ||||
| 			} | ||||
| 		} | ||||
| 		if enry.IsGenerated(f.Name(), content) { | ||||
| 			continue | ||||
| @@ -86,24 +160,20 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err | ||||
| 	return sizes, nil | ||||
| } | ||||
|  | ||||
| func readFile(entry *TreeEntry, limit int64) ([]byte, error) { | ||||
| 	// FIXME: We can probably be a little more efficient here... see above | ||||
| 	r, err := entry.Blob().DataAsync() | ||||
| 	if err != nil { | ||||
| 		return nil, err | ||||
| func discardFull(rd *bufio.Reader, discard int64) error { | ||||
| 	if discard > math.MaxInt32 { | ||||
| 		n, err := rd.Discard(math.MaxInt32) | ||||
| 		discard -= int64(n) | ||||
| 		if err != nil { | ||||
| 			return err | ||||
| 		} | ||||
| 	} | ||||
| 	defer r.Close() | ||||
|  | ||||
| 	if limit <= 0 { | ||||
| 		return ioutil.ReadAll(r) | ||||
| 	for discard > 0 { | ||||
| 		n, err := rd.Discard(int(discard)) | ||||
| 		discard -= int64(n) | ||||
| 		if err != nil { | ||||
| 			return err | ||||
| 		} | ||||
| 	} | ||||
|  | ||||
| 	size := entry.Size() | ||||
| 	if limit > 0 && size > limit { | ||||
| 		size = limit | ||||
| 	} | ||||
| 	buf := bytes.NewBuffer(nil) | ||||
| 	buf.Grow(int(size)) | ||||
| 	_, err = io.Copy(buf, io.LimitReader(r, limit)) | ||||
| 	return buf.Bytes(), err | ||||
| 	return nil | ||||
| } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user