From d95d26e4930db9be70d5656fc7fcaf9957eb1fdb Mon Sep 17 00:00:00 2001
From: Maurice <mauriceboe@icloud.com>
Date: Fri, 26 Jun 2026 14:50:50 +0200
Subject: [PATCH] fix(extract): disable model thinking for grammar-constrained
 extraction
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Hybrid/reasoning models (Qwen3 and similar) emit reasoning tokens by default, and those tokens collide with Ollama's format-grammar constraint. On CPU this produced null or unparseable output and blew the latency budget (qwen3:8b returned null or hit the 300s timeout, versus ~20s with thinking off). Send `think: false` on the /api/chat call; Ollama ignores it for non-thinking models (verified on qwen2.5:7b), so the change is safe and unlocks the stronger Qwen3 family.
---
 server/src/nest/llm-parse/router/ollama-format.client.ts | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/server/src/nest/llm-parse/router/ollama-format.client.ts b/server/src/nest/llm-parse/router/ollama-format.client.ts
index de291c3f..540a54e0 100644
--- a/server/src/nest/llm-parse/router/ollama-format.client.ts
+++ b/server/src/nest/llm-parse/router/ollama-format.client.ts
@@ -54,6 +54,10 @@ export async function extractEnforced(input: EnforcedExtractInput): Promise<Reco
     model: input.model,
     stream: false,
     format: input.schema,
+    // Disable "thinking" for hybrid/reasoning models (Qwen3, etc.): the reasoning tokens
+    // collide with the format-grammar constraint here — they produce unparseable output and
+    // blow the latency budget on CPU. Ollama ignores this for non-thinking models, so it's safe.
+    think: false,
     // Keep the model resident a while so back-to-back imports don't pay the cold load.
     keep_alive: '30m',
     options: { temperature: 0, num_predict: input.numPredict ?? 512, num_ctx: input.numCtx ?? 8192 },