From d95d26e4930db9be70d5656fc7fcaf9957eb1fdb Mon Sep 17 00:00:00 2001
From: Maurice
Date: Fri, 26 Jun 2026 14:50:50 +0200
Subject: [PATCH] fix(extract): disable model thinking for grammar-constrained extraction
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Hybrid/reasoning models (Qwen3 and similar) default to emitting
reasoning tokens, which collide with Ollama's format-grammar
constraint — on CPU this produced null/unparseable output and blew the
latency budget (qwen3:8b: null or 300s timeouts vs ~20s with thinking
off).

Send think:false on the /api/chat call; Ollama ignores it for
non-thinking models (verified on qwen2.5:7b), so it's safe and unlocks
the stronger Qwen3 family.
---
 server/src/nest/llm-parse/router/ollama-format.client.ts | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/server/src/nest/llm-parse/router/ollama-format.client.ts b/server/src/nest/llm-parse/router/ollama-format.client.ts
index de291c3f..540a54e0 100644
--- a/server/src/nest/llm-parse/router/ollama-format.client.ts
+++ b/server/src/nest/llm-parse/router/ollama-format.client.ts
@@ -54,6 +54,10 @@ export async function extractEnforced(input: EnforcedExtractInput): Promise