fix(server): cjk migration (#24320)
Some checks failed
CodeQL / Analyze (javascript) (push) Has been cancelled
CodeQL / Analyze (python) (push) Has been cancelled
Docker / pre-job (push) Has been cancelled
Docker / Re-Tag ML () (push) Has been cancelled
Docker / Re-Tag ML (-armnn) (push) Has been cancelled
Docker / Re-Tag ML (-cuda) (push) Has been cancelled
Docker / Re-Tag ML (-openvino) (push) Has been cancelled
Docker / Re-Tag ML (-rknn) (push) Has been cancelled
Docker / Re-Tag ML (-rocm) (push) Has been cancelled
Docker / Re-Tag Server () (push) Has been cancelled
Docker / Build and Push ML (armnn, linux/arm64, -armnn) (push) Has been cancelled
Docker / Build and Push ML (cpu) (push) Has been cancelled
Docker / Build and Push ML (cuda, linux/amd64, -cuda) (push) Has been cancelled
Docker / Build and Push ML (openvino, linux/amd64, -openvino) (push) Has been cancelled
Docker / Build and Push ML (rknn, linux/arm64, -rknn) (push) Has been cancelled
Docker / Build and Push ML (rocm, linux/amd64, {"linux/amd64": "mich"}, -rocm) (push) Has been cancelled
Docker / Build and Push Server (push) Has been cancelled
Docker / Docker Build & Push Server Success (push) Has been cancelled
Docker / Docker Build & Push ML Success (push) Has been cancelled
Docs build / pre-job (push) Has been cancelled
Docs build / Docs Build (push) Has been cancelled
Zizmor / Zizmor (push) Has been cancelled
Manage release PR / bump (push) Has been cancelled
Static Code Analysis / pre-job (push) Has been cancelled
Static Code Analysis / Run Dart Code Analysis (push) Has been cancelled
Test / pre-job (push) Has been cancelled
Test / Test & Lint Server (push) Has been cancelled
Test / Unit Test CLI (push) Has been cancelled
Test / Unit Test CLI (Windows) (push) Has been cancelled
Test / Lint Web (push) Has been cancelled
Test / Test Web (push) Has been cancelled
Test / Test i18n (push) Has been cancelled
Test / End-to-End Lint (push) Has been cancelled
Test / Medium Tests (Server) (push) Has been cancelled
Test / End-to-End Tests (Server & CLI) (ubuntu-24.04-arm) (push) Has been cancelled
Test / End-to-End Tests (Server & CLI) (ubuntu-latest) (push) Has been cancelled
Test / End-to-End Tests (Web) (ubuntu-24.04-arm) (push) Has been cancelled
Test / End-to-End Tests (Web) (ubuntu-latest) (push) Has been cancelled
Test / End-to-End Tests Success (push) Has been cancelled
Test / Unit Test Mobile (push) Has been cancelled
Test / Unit Test ML (push) Has been cancelled
Test / .github Files Formatting (push) Has been cancelled
Test / ShellCheck (push) Has been cancelled
Test / OpenAPI Clients (push) Has been cancelled
Test / SQL Schema Checks (push) Has been cancelled

* join string

* use pagination instead
This commit is contained in:
Mert
2025-12-01 16:41:19 -05:00
committed by GitHub
parent 95c29a8aea
commit 7c19b0591f

View File

@@ -3,21 +3,28 @@ import { tokenizeForSearch } from 'src/utils/database';
// NOTE(review): this span appears to be a diff hunk whose +/- markers, blank lines and
// indentation were lost, so two alternative refill loops (stream-based, then paginated) appear
// back to back and the braces do not balance as written — confirm against the real migration file.
/**
 * Migration step that empties `ocr_search` and refills it from `asset_ocr`.
 *
 * Rows of `asset_ocr` are grouped by `assetId`, their `text` values are concatenated with a
 * single space via `string_agg`, and the result is passed through `tokenizeForSearch` before
 * being inserted into `ocr_search`.
 *
 * @param db Kysely handle used for every statement issued here.
 */
export async function up(db: Kysely<any>): Promise<void> {
// Start from an empty target table; everything below re-inserts into it.
await sql`truncate ${sql.table('ocr_search')}`.execute(db);
// --- First variant (presumably the pre-change side of the hunk — TODO confirm) ---
// Streams one aggregated row per assetId and buffers the rows to insert in `batch`.
const batch = [];
for await (const { assetId, text } of db
.selectFrom('asset_ocr')
.select(['assetId', sql<string>`string_agg(text, ' ')`.as('text')])
.groupBy('assetId')
.stream()) {
// Here the return value of tokenizeForSearch is stored as-is (the second variant joins it).
batch.push({ assetId, text: tokenizeForSearch(text) });
// Flush once 5000 rows are buffered, then empty the same array in place for reuse.
if (batch.length >= 5000) {
await db.insertInto('ocr_search').values(batch).execute();
batch.length = 0;
}
}
// Insert whatever is left over after the stream ends.
if (batch.length > 0) {
await db.insertInto('ocr_search').values(batch).execute();
// --- Second variant (presumably the post-change side of the hunk — TODO confirm) ---
// Pages through the aggregated rows in assetId order, 5000 per query, resuming after the
// last assetId of the previous page. `lastAssetId` stays undefined until a page has been read.
let lastAssetId: string | undefined;
while (true) {
// The `assetId > lastAssetId` filter is only added once a previous page exists.
const rows = await db
.selectFrom('asset_ocr')
.select(['assetId', sql<string>`string_agg(text, ' ')`.as('text')])
.$if(lastAssetId !== undefined, (qb) => qb.where('assetId', '>', lastAssetId))
.groupBy('assetId')
.orderBy('assetId')
.limit(5000)
.execute();
// An empty page means every assetId has been processed.
if (rows.length === 0) {
break;
}
// The tokenizer result is joined with spaces here, so it presumably returns an array of
// tokens — verify against src/utils/database.
await db
.insertInto('ocr_search')
.values(rows.map(({ assetId, text }) => ({ assetId, text: tokenizeForSearch(text).join(' ') })))
.execute();
// rows is non-empty at this point (checked above), so the non-null assertion holds.
lastAssetId = rows.at(-1)!.assetId;
}
}