From ac03b7ca13958b886e8d2f33f1a7a098bd7951d1 Mon Sep 17 00:00:00 2001
From: Maurice <mauriceboe@icloud.com>
Date: Wed, 24 Jun 2026 21:20:20 +0200
Subject: [PATCH] fix(extract): make AI imports reliable and fast on local
 models

client: the import call inherited the global 8s axios timeout and aborted long LLM extractions even though the server finished them; disable the timeout for this call only. server: raise the OpenAI-compatible LLM timeout 60s->180s (a cold Ollama model can take ~45s to first token). server: cap extracted text to 8000 chars before the LLM - multi-page T&C tails (30k+ chars) overflowed the context window, truncating the relevant head and making CPU inference crawl; booking details sit at the top.
---
 client/src/api/client.ts                                    | 4 +++-
 .../src/nest/llm-parse/clients/openai-compatible.client.ts  | 4 +++-
 server/src/nest/llm-parse/llm-parse.service.ts              | 6 ++++++
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/client/src/api/client.ts b/client/src/api/client.ts
index 9acfec4a..eed2b54d 100644
--- a/client/src/api/client.ts
+++ b/client/src/api/client.ts
@@ -664,7 +664,9 @@ export const reservationsApi = {
     const fd = new FormData()
     for (const f of files) fd.append('files', f)
     fd.append('mode', mode)
-    return apiClient.post(`/trips/${tripId}/reservations/import/booking`, fd, { headers: { 'Content-Type': 'multipart/form-data' } }).then(r => r.data)
+    // No client-side timeout: kitinerary + LLM extraction routinely exceeds the
+    // global 8s default (a cold local model alone can take ~45s).
+    return apiClient.post(`/trips/${tripId}/reservations/import/booking`, fd, { headers: { 'Content-Type': 'multipart/form-data' }, timeout: 0 }).then(r => r.data)
   },
   importBookingConfirm: (tripId: number | string, items: BookingImportPreviewItem[]): Promise<BookingImportConfirmResponse> =>
     apiClient.post(`/trips/${tripId}/reservations/import/booking/confirm`, { items }).then(r => r.data),
diff --git a/server/src/nest/llm-parse/clients/openai-compatible.client.ts b/server/src/nest/llm-parse/clients/openai-compatible.client.ts
index c7637214..376eabc4 100644
--- a/server/src/nest/llm-parse/clients/openai-compatible.client.ts
+++ b/server/src/nest/llm-parse/clients/openai-compatible.client.ts
@@ -1,6 +1,8 @@
 import type { LlmExtractionClient, LlmExtractionInput } from '../llm-provider.interface';
 
-const TIMEOUT_MS = 60_000;
+// Generous: a local model (Ollama) may cold-load several GB before its first
+// token, and longer documents push inference past a minute.
+const TIMEOUT_MS = 180_000;
 const MAX_TOKENS = 4096;
 
 /**
diff --git a/server/src/nest/llm-parse/llm-parse.service.ts b/server/src/nest/llm-parse/llm-parse.service.ts
index 42c21164..00db8b1c 100644
--- a/server/src/nest/llm-parse/llm-parse.service.ts
+++ b/server/src/nest/llm-parse/llm-parse.service.ts
@@ -54,6 +54,12 @@ export class LlmParseService {
         );
       } else {
         input.text = await extractText(file.buffer, file.originalName);
+        // Booking details sit at the top of a confirmation; multi-page T&C tails
+        // (rental/insurance docs run 30k+ chars) otherwise overflow the model's
+        // context window — truncating the *relevant* head — and balloon CPU
+        // inference time. Cap the text so only the useful head reaches the LLM.
+        const MAX_EXTRACT_CHARS = 8000;
+        if (input.text.length > MAX_EXTRACT_CHARS) input.text = input.text.slice(0, MAX_EXTRACT_CHARS);
         console.debug(`[DEBUG] Extracted text from ${file.originalName} (${input.text.length} chars):\n`, input.text);
         if (!input.text.trim()) {
           return {