From 23d5a5bd9c501db985b9053feff2965913a398ed Mon Sep 17 00:00:00 2001
From: Maurice <mauriceboe@icloud.com>
Date: Wed, 24 Jun 2026 22:44:55 +0200
Subject: [PATCH] perf(extract): cap LLM input at 4000 chars for CPU-only speed

On a GPU-less host, the model's prompt-evaluation time scales with input
length and dominates total latency. Booking details sit at the top of a
confirmation, so capping the extracted text at 4000 chars (previously
8000) roughly halves extraction time (~50s warm for a capable local 7B
model) with no loss of fields on real hotel/rental confirmations.

The cap is a single constant (MAX_EXTRACT_CHARS) and can be raised if a
long multi-segment itinerary needs more text.
---
 server/src/nest/llm-parse/llm-parse.service.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/server/src/nest/llm-parse/llm-parse.service.ts b/server/src/nest/llm-parse/llm-parse.service.ts
index cc3a3241..fa103691 100644
--- a/server/src/nest/llm-parse/llm-parse.service.ts
+++ b/server/src/nest/llm-parse/llm-parse.service.ts
@@ -58,7 +58,7 @@ export class LlmParseService {
         // (rental/insurance docs run 30k+ chars) otherwise overflow the model's
         // context window — truncating the *relevant* head — and balloon CPU
         // inference time. Cap the text so only the useful head reaches the LLM.
-        const MAX_EXTRACT_CHARS = 8000;
+        const MAX_EXTRACT_CHARS = 4000;
         if (input.text.length > MAX_EXTRACT_CHARS) input.text = input.text.slice(0, MAX_EXTRACT_CHARS);
         console.debug(`[DEBUG] Extracted text from ${file.originalName} (${input.text.length} chars):\n`, input.text);
         if (!input.text.trim()) {