From ac03b7ca13958b886e8d2f33f1a7a098bd7951d1 Mon Sep 17 00:00:00 2001 From: Maurice Date: Wed, 24 Jun 2026 21:20:20 +0200 Subject: [PATCH] fix(extract): make AI imports reliable and fast on local models client: the import call inherited the global 8s axios timeout and aborted long LLM extractions even though the server finished them; remove the timeout. server: raise the OpenAI-compatible LLM timeout 60s->180s (a cold Ollama model can take ~45s to first token). server: cap extracted text to 8000 chars before it reaches the LLM - multi-page T&C tails (30k+ chars) overflowed the context window, truncating the relevant head and making CPU inference crawl; booking details sit at the top. --- client/src/api/client.ts | 4 +++- .../src/nest/llm-parse/clients/openai-compatible.client.ts | 4 +++- server/src/nest/llm-parse/llm-parse.service.ts | 6 ++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/client/src/api/client.ts b/client/src/api/client.ts index 9acfec4a..eed2b54d 100644 --- a/client/src/api/client.ts +++ b/client/src/api/client.ts @@ -664,7 +664,9 @@ export const reservationsApi = { const fd = new FormData() for (const f of files) fd.append('files', f) fd.append('mode', mode) - return apiClient.post(`/trips/${tripId}/reservations/import/booking`, fd, { headers: { 'Content-Type': 'multipart/form-data' } }).then(r => r.data) + // No client-side timeout: kitinerary + LLM extraction routinely exceeds the + // global 8s default (a cold local model alone can take ~45s). 
+ return apiClient.post(`/trips/${tripId}/reservations/import/booking`, fd, { headers: { 'Content-Type': 'multipart/form-data' }, timeout: 0 }).then(r => r.data) }, importBookingConfirm: (tripId: number | string, items: BookingImportPreviewItem[]): Promise => apiClient.post(`/trips/${tripId}/reservations/import/booking/confirm`, { items }).then(r => r.data), diff --git a/server/src/nest/llm-parse/clients/openai-compatible.client.ts b/server/src/nest/llm-parse/clients/openai-compatible.client.ts index c7637214..376eabc4 100644 --- a/server/src/nest/llm-parse/clients/openai-compatible.client.ts +++ b/server/src/nest/llm-parse/clients/openai-compatible.client.ts @@ -1,6 +1,8 @@ import type { LlmExtractionClient, LlmExtractionInput } from '../llm-provider.interface'; -const TIMEOUT_MS = 60_000; +// Generous: a local model (Ollama) may cold-load several GB before its first +// token, and longer documents push inference past a minute. +const TIMEOUT_MS = 180_000; const MAX_TOKENS = 4096; /** diff --git a/server/src/nest/llm-parse/llm-parse.service.ts b/server/src/nest/llm-parse/llm-parse.service.ts index 42c21164..00db8b1c 100644 --- a/server/src/nest/llm-parse/llm-parse.service.ts +++ b/server/src/nest/llm-parse/llm-parse.service.ts @@ -54,6 +54,12 @@ export class LlmParseService { ); } else { input.text = await extractText(file.buffer, file.originalName); + // Booking details sit at the top of a confirmation; multi-page T&C tails + // (rental/insurance docs run 30k+ chars) otherwise overflow the model's + // context window — truncating the *relevant* head — and balloon CPU + // inference time. Cap the text so only the useful head reaches the LLM. + const MAX_EXTRACT_CHARS = 8000; + if (input.text.length > MAX_EXTRACT_CHARS) input.text = input.text.slice(0, MAX_EXTRACT_CHARS); console.debug(`[DEBUG] Extracted text from ${file.originalName} (${input.text.length} chars):\n`, input.text); if (!input.text.trim()) { return {