feat(cli): use a queue for duplicate and upload (#10750)

* feat(cli): use a queue for duplicate and upload

Using a queue to process files makes duplicate detection and asset upload more stable and more tolerant of network errors. If an error occurs, the whole command no longer stops: the failing task is retried (up to 3 times) before the error is logged and processing moves on to the next task.

The new queue abstraction uses [fastq](https://www.npmjs.com/package/fastq) internally.
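
The `src/queue` module added by this commit is not shown in this excerpt. Purely as a sketch of the idea (the task-status names other than `'failed'`, and the exact retry bookkeeping, are assumptions inferred from the usage in the diff below), a fastq-backed wrapper with the described behaviour could look roughly like this:

```typescript
import * as fastq from 'fastq';
import type { queueAsPromised } from 'fastq';

// Status names other than 'failed' (which the CLI filters on below) are assumptions.
export type Task<T> = { data: T; status: 'idle' | 'active' | 'succeeded' | 'failed'; error?: unknown };

export class Queue<T, R> {
  readonly tasks: Task<T>[] = [];
  private readonly inner: queueAsPromised<Task<T>, R | undefined>;

  constructor(worker: (data: T) => Promise<R>, options: { concurrency: number; retry: number }) {
    this.inner = fastq.promise(async (task: Task<T>): Promise<R | undefined> => {
      // Retry the worker a few times before marking the task as failed, so one
      // flaky network call does not abort the whole command.
      for (let attempt = 1; attempt <= options.retry; attempt++) {
        try {
          task.status = 'active';
          const result = await worker(task.data);
          task.status = 'succeeded';
          return result;
        } catch (error) {
          if (attempt === options.retry) {
            task.status = 'failed';
            task.error = error;
          }
        }
      }
      return undefined;
    }, options.concurrency);
  }

  // Resolves with the task record once it has been processed (see the follow-up commit below).
  async push(data: T): Promise<Task<T>> {
    const task: Task<T> = { data, status: 'idle' };
    this.tasks.push(task);
    await this.inner.push(task);
    return task;
  }

  drained(): Promise<void> {
    return this.inner.drained();
  }
}
```

fastq's promise API already provides `push` and `drained`; the wrapper only adds per-task bookkeeping and the retry loop on top of it.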

* chore(cli): queue.push returns a promise which resolves with the task
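
A hypothetical illustration of what that enables for a caller (file names and surrounding variables are illustrative only):

```typescript
// Awaiting push() yields the settled task record, so the caller can inspect it directly.
const task = await queue.push(['IMG_0001.jpg', 'IMG_0002.jpg']);
if (task.status === 'failed') {
  console.error(`Hash check failed for ${task.data}: ${task.error}`);
}
```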

* test(cli): add spec for uploadFiles and checkForDuplicates
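
The spec file itself is not shown in this excerpt. A minimal sketch of the kind of assertion it could make against the newly exported `checkForDuplicates` (vitest as the runner, the import path, and the option shape are assumptions):

```typescript
import { describe, expect, it } from 'vitest';
// Assumed import path; checkForDuplicates and uploadFiles are exported by this commit so they can be tested.
import { checkForDuplicates } from 'src/commands/asset';

describe('checkForDuplicates', () => {
  it('treats every file as new when skipHash is set', async () => {
    // With skipHash enabled the function returns before hashing or calling the API (see the diff below).
    const options = { skipHash: true, concurrency: 3 } as any; // only the fields the function reads
    await expect(checkForDuplicates(['photo.jpg'], options)).resolves.toEqual({
      newFiles: ['photo.jpg'],
      duplicates: [],
    });
  });
});
```
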
Authored by Simon Thiboutôt on 2024-07-08 23:39:07 -04:00, committed by GitHub
parent af94f0f979, commit eb89208abb
5 changed files with 456 additions and 39 deletions


@@ -16,6 +16,7 @@ import { chunk } from 'lodash-es';
 import { Stats, createReadStream } from 'node:fs';
 import { stat, unlink } from 'node:fs/promises';
 import path, { basename } from 'node:path';
+import { Queue } from 'src/queue';
 import { BaseOptions, authenticate, crawl, sha1 } from 'src/utils';
 
 const s = (count: number) => (count === 1 ? '' : 's');
@@ -83,7 +84,7 @@ const scan = async (pathsToCrawl: string[], options: UploadOptionsDto) => {
   return files;
 };
 
-const checkForDuplicates = async (files: string[], { concurrency, skipHash }: UploadOptionsDto) => {
+export const checkForDuplicates = async (files: string[], { concurrency, skipHash }: UploadOptionsDto) => {
   if (skipHash) {
     console.log('Skipping hash check, assuming all files are new');
     return { newFiles: files, duplicates: [] };
@@ -99,32 +100,50 @@ const checkForDuplicates = async (files: string[], { concurrency, skipHash }: UploadOptionsDto) => {
   const newFiles: string[] = [];
   const duplicates: Asset[] = [];
 
-  try {
-    // TODO refactor into a queue
-    for (const items of chunk(files, concurrency)) {
-      const dto = await Promise.all(items.map(async (filepath) => ({ id: filepath, checksum: await sha1(filepath) })));
-      const { results } = await checkBulkUpload({ assetBulkUploadCheckDto: { assets: dto } });
-
-      for (const { id: filepath, assetId, action } of results as AssetBulkUploadCheckResults) {
+  const queue = new Queue<string[], AssetBulkUploadCheckResults>(
+    async (filepaths: string[]) => {
+      const dto = await Promise.all(
+        filepaths.map(async (filepath) => ({ id: filepath, checksum: await sha1(filepath) })),
+      );
+      const response = await checkBulkUpload({ assetBulkUploadCheckDto: { assets: dto } });
+      const results = response.results as AssetBulkUploadCheckResults;
+      for (const { id: filepath, assetId, action } of results) {
         if (action === Action.Accept) {
           newFiles.push(filepath);
         } else {
           // rejects are always duplicates
          duplicates.push({ id: assetId as string, filepath });
         }
-        progressBar.increment();
       }
-    }
-  } finally {
-    progressBar.stop();
-  }
+      progressBar.increment(filepaths.length);
+      return results;
+    },
+    { concurrency, retry: 3 },
+  );
+
+  for (const items of chunk(files, concurrency)) {
+    await queue.push(items);
+  }
+
+  await queue.drained();
+
+  progressBar.stop();
 
   console.log(`Found ${newFiles.length} new files and ${duplicates.length} duplicate${s(duplicates.length)}`);
 
+  // Report failures
+  const failedTasks = queue.tasks.filter((task) => task.status === 'failed');
+  if (failedTasks.length > 0) {
+    console.log(`Failed to verify ${failedTasks.length} file${s(failedTasks.length)}:`);
+    for (const task of failedTasks) {
+      console.log(`- ${task.data} - ${task.error}`);
+    }
+  }
+
   return { newFiles, duplicates };
 };
 
-const uploadFiles = async (files: string[], { dryRun, concurrency }: UploadOptionsDto): Promise<Asset[]> => {
+export const uploadFiles = async (files: string[], { dryRun, concurrency }: UploadOptionsDto): Promise<Asset[]> => {
   if (files.length === 0) {
     console.log('All assets were already uploaded, nothing to do.');
     return [];
@@ -158,37 +177,52 @@ const uploadFiles = async (files: string[], { dryRun, concurrency }: UploadOptionsDto): Promise<Asset[]> => {
   const newAssets: Asset[] = [];
 
-  try {
-    for (const items of chunk(files, concurrency)) {
-      await Promise.all(
-        items.map(async (filepath) => {
-          const stats = statsMap.get(filepath) as Stats;
-          const response = await uploadFile(filepath, stats);
+  const queue = new Queue<string, AssetMediaResponseDto>(
+    async (filepath: string) => {
+      const stats = statsMap.get(filepath);
+      if (!stats) {
+        throw new Error(`Stats not found for ${filepath}`);
+      }
 
-          newAssets.push({ id: response.id, filepath });
+      const response = await uploadFile(filepath, stats);
+
+      newAssets.push({ id: response.id, filepath });
 
-          if (response.status === AssetMediaStatus.Duplicate) {
-            duplicateCount++;
-            duplicateSize += stats.size ?? 0;
-          } else {
-            successCount++;
-            successSize += stats.size ?? 0;
-          }
+      if (response.status === AssetMediaStatus.Duplicate) {
+        duplicateCount++;
+        duplicateSize += stats.size ?? 0;
+      } else {
+        successCount++;
+        successSize += stats.size ?? 0;
+      }
 
-          uploadProgress.update(successSize, { value_formatted: byteSize(successSize + duplicateSize) });
+      uploadProgress.update(successSize, { value_formatted: byteSize(successSize + duplicateSize) });
+
+      return response;
+    },
+    { concurrency, retry: 3 },
+  );
 
-          return response;
-        }),
-      );
-    }
-  } finally {
-    uploadProgress.stop();
-  }
+  for (const filepath of files) {
+    await queue.push(filepath);
+  }
+
+  await queue.drained();
+
+  uploadProgress.stop();
 
   console.log(`Successfully uploaded ${successCount} new asset${s(successCount)} (${byteSize(successSize)})`);
   if (duplicateCount > 0) {
     console.log(`Skipped ${duplicateCount} duplicate asset${s(duplicateCount)} (${byteSize(duplicateSize)})`);
   }
 
+  // Report failures
+  const failedTasks = queue.tasks.filter((task) => task.status === 'failed');
+  if (failedTasks.length > 0) {
+    console.log(`Failed to upload ${failedTasks.length} asset${s(failedTasks.length)}:`);
+    for (const task of failedTasks) {
+      console.log(`- ${task.data} - ${task.error}`);
+    }
+  }
+
   return newAssets;
 };