From 23d5a5bd9c501db985b9053feff2965913a398ed Mon Sep 17 00:00:00 2001
From: Maurice
Date: Wed, 24 Jun 2026 22:44:55 +0200
Subject: [PATCH] perf(extract): cap LLM input at 4000 chars for CPU-only speed

On a GPU-less host the model's prompt-eval time scales with input length
and dominates total latency. Booking details sit at the top of a
confirmation, so capping the extracted text at 4000 chars (was 8000)
roughly halves extraction time (~50s warm for a capable local 7B model)
with no loss of fields on real hotel/rental confirmations. Tunable if a
long multi-segment itinerary needs more.
---
 server/src/nest/llm-parse/llm-parse.service.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/server/src/nest/llm-parse/llm-parse.service.ts b/server/src/nest/llm-parse/llm-parse.service.ts
index cc3a3241..fa103691 100644
--- a/server/src/nest/llm-parse/llm-parse.service.ts
+++ b/server/src/nest/llm-parse/llm-parse.service.ts
@@ -58,7 +58,7 @@ export class LlmParseService {
     // (rental/insurance docs run 30k+ chars) otherwise overflow the model's
     // context window — truncating the *relevant* head — and balloon CPU
     // inference time. Cap the text so only the useful head reaches the LLM.
-    const MAX_EXTRACT_CHARS = 8000;
+    const MAX_EXTRACT_CHARS = 4000;
     if (input.text.length > MAX_EXTRACT_CHARS) input.text = input.text.slice(0, MAX_EXTRACT_CHARS);
     console.debug(`[DEBUG] Extracted text from ${file.originalName} (${input.text.length} chars):\n`, input.text);
     if (!input.text.trim()) {