mirror of
https://github.com/mauriceboe/TREK.git
synced 2026-06-30 18:46:00 +00:00
fix(extract): make AI imports reliable and fast on local models
client: the import call inherited the global 8s axios timeout, so long LLM extractions were aborted on the client even though the server went on to finish them; remove the timeout. server: raise the OpenAI-compatible LLM timeout from 60s to 180s (a cold Ollama model can take ~45s to produce its first token). server: cap extracted text at 8000 chars before it is sent to the LLM - multi-page T&C tails (30k+ chars) overflowed the context window, which truncated the relevant head and made CPU inference crawl; the booking details sit at the top of the document.
This commit is contained in:
@@ -1,6 +1,8 @@
|
||||
import type { LlmExtractionClient, LlmExtractionInput } from '../llm-provider.interface';
|
||||
|
||||
const TIMEOUT_MS = 60_000;
|
||||
// Generous: a local model (Ollama) may cold-load several GB before its first
|
||||
// token, and longer documents push inference past a minute.
|
||||
const TIMEOUT_MS = 180_000;
|
||||
const MAX_TOKENS = 4096;
|
||||
|
||||
/**
|
||||
|
||||
@@ -54,6 +54,12 @@ export class LlmParseService {
|
||||
);
|
||||
} else {
|
||||
input.text = await extractText(file.buffer, file.originalName);
|
||||
// Booking details sit at the top of a confirmation; multi-page T&C tails
|
||||
// (rental/insurance docs run 30k+ chars) otherwise overflow the model's
|
||||
// context window — truncating the *relevant* head — and balloon CPU
|
||||
// inference time. Cap the text so only the useful head reaches the LLM.
|
||||
const MAX_EXTRACT_CHARS = 8000;
|
||||
if (input.text.length > MAX_EXTRACT_CHARS) input.text = input.text.slice(0, MAX_EXTRACT_CHARS);
|
||||
console.debug(`[DEBUG] Extracted text from ${file.originalName} (${input.text.length} chars):\n`, input.text);
|
||||
if (!input.text.trim()) {
|
||||
return {
|
||||
|
||||
Reference in New Issue
Block a user