mirror of
https://github.com/mauriceboe/TREK.git
synced 2026-06-30 18:46:00 +00:00
feat(extract): drive local parsing through a layered extraction router
The single-shot prompt was unreliable on multi-leg flights and longer documents, and slow on a CPU host. For the local provider, run a small router instead: - deterministic vendor templates first, with no model call at all - exactly one grammar-enforced call per document via Ollama's native `format` (flights as a flat array of legs, everything else as one flat reservation, the type picked from keywords or a union schema) - booking-wide fields (booking reference, total price, the overnight arrival day) filled deterministically from the text afterwards, and dates coerced to ISO so a natural-language date can't slip through Recommend qwen2.5 in the AI-parsing settings instead of NuExtract.
This commit is contained in:
@@ -318,12 +318,12 @@ export default function AddonManager({ bagTrackingEnabled, onToggleBagTracking,
|
||||
const MASKED = '••••••••'
|
||||
const DEFAULT_OLLAMA_URL = 'http://localhost:11434/v1'
|
||||
|
||||
/** Curated NuExtract models, pullable via Ollama (HF GGUF for 2.0; library for 1.5). */
|
||||
const NUEXTRACT_MODELS: { id: string; label: string; note: string; recommended: boolean; vision: boolean }[] = [
|
||||
{ id: 'hf.co/numind/NuExtract-2.0-2B-GGUF', label: 'NuExtract 2.0 — 2B', note: 'Vision · fastest on CPU · MIT license — recommended', recommended: true, vision: true },
|
||||
{ id: 'hf.co/numind/NuExtract-2.0-8B-GGUF', label: 'NuExtract 2.0 — 8B', note: 'Vision · highest quality · slower on CPU · MIT license', recommended: false, vision: true },
|
||||
{ id: 'hf.co/numind/NuExtract-2.0-4B-GGUF', label: 'NuExtract 2.0 — 4B', note: 'Vision · non-commercial (Qwen Research) license', recommended: false, vision: true },
|
||||
{ id: 'nuextract', label: 'NuExtract 1.5 — 3.8B', note: 'Text-only', recommended: false, vision: false },
|
||||
/** Curated models the local extractor is tuned for, pullable via Ollama. The router
|
||||
* uses the strong model for flights/multi-item docs and the small one (when installed)
|
||||
* for simple single-item bookings — so a host only needs these two. */
|
||||
const RECOMMENDED_MODELS: { id: string; label: string; note: string; recommended: boolean; vision: boolean }[] = [
|
||||
{ id: 'qwen2.5:7b', label: 'Qwen2.5 — 7B', note: 'Recommended · reliable for flights & multi-item bookings · Apache-2.0', recommended: true, vision: false },
|
||||
{ id: 'qwen2.5:3b', label: 'Qwen2.5 — 3B', note: 'Optional · used automatically for simple bookings (~3× faster) · Apache-2.0', recommended: false, vision: false },
|
||||
]
|
||||
|
||||
/**
|
||||
@@ -484,9 +484,9 @@ function LlmParsingConfig({ addon }: { addon: Addon }) {
|
||||
)}
|
||||
|
||||
<div className="border-t border-edge-secondary pt-3">
|
||||
<div className="mb-2 text-xs font-medium text-content-secondary">Pull a NuExtract model</div>
|
||||
<div className="mb-2 text-xs font-medium text-content-secondary">Pull a recommended model</div>
|
||||
<div className="space-y-1">
|
||||
{NUEXTRACT_MODELS.map(m => {
|
||||
{RECOMMENDED_MODELS.map(m => {
|
||||
const installedHere = isInstalled(m.id)
|
||||
const isPulling = pulling === m.id
|
||||
const active = model === m.id
|
||||
|
||||
@@ -4,6 +4,7 @@ import { resolveLlmConfig } from './llm-config.resolver';
|
||||
import { buildSystemPrompt, KI_RESERVATION_JSON_SCHEMA } from './llm-prompt';
|
||||
import type { LlmExtractionInput } from './llm-provider.interface';
|
||||
import { isPdf, extractText } from './text-extract';
|
||||
import { routeExtraction } from './router/extraction-router';
|
||||
import { Injectable } from '@nestjs/common';
|
||||
import { kiReservationSchema } from '@trek/shared';
|
||||
|
||||
@@ -54,11 +55,10 @@ export class LlmParseService {
|
||||
);
|
||||
} else {
|
||||
input.text = await extractText(file.buffer, file.originalName);
|
||||
// Booking details sit at the top of a confirmation; multi-page T&C tails
|
||||
// (rental/insurance docs run 30k+ chars) otherwise overflow the model's
|
||||
// context window — truncating the *relevant* head — and balloon CPU
|
||||
// inference time. Cap the text so only the useful head reaches the LLM.
|
||||
const MAX_EXTRACT_CHARS = 4000;
|
||||
// The local router decomposes the document and extracts one reservation at a
|
||||
// time, so it tolerates more text than the single-shot path (which had to cap
|
||||
// at 4000 to fit a small context). Cloud single-shot keeps the tight cap.
|
||||
const MAX_EXTRACT_CHARS = config.provider === 'local' ? 16000 : 4000;
|
||||
if (input.text.length > MAX_EXTRACT_CHARS) input.text = input.text.slice(0, MAX_EXTRACT_CHARS);
|
||||
console.debug(`[DEBUG] Extracted text from ${file.originalName} (${input.text.length} chars):\n`, input.text);
|
||||
if (!input.text.trim()) {
|
||||
@@ -75,6 +75,26 @@ export class LlmParseService {
|
||||
};
|
||||
}
|
||||
|
||||
// Local provider (Ollama): go through the layered extraction router — vendor
|
||||
// templates → decompose + grammar-enforced per-reservation extraction → validate
|
||||
// + repair. Far more reliable on small CPU models than the single-shot path below
|
||||
// (which stays for cloud providers, whose strong models handle one-shot well).
|
||||
if (config.provider === 'local' && input.text) {
|
||||
try {
|
||||
const routed = await routeExtraction(input.text, {
|
||||
baseUrl: config.baseUrl ?? 'http://localhost:11434/v1',
|
||||
model: config.model,
|
||||
apiKey: config.apiKey,
|
||||
});
|
||||
return { kiItems: routed.kiItems, warnings: [...warnings, ...routed.warnings] };
|
||||
} catch (err) {
|
||||
return {
|
||||
kiItems: [],
|
||||
warnings: [`${file.originalName}: AI parsing failed — ${err instanceof Error ? err.message : String(err)}`],
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
let raw: Record<string, unknown>[];
|
||||
try {
|
||||
raw = await createLlmClient(config).extract(input);
|
||||
|
||||
@@ -0,0 +1,207 @@
|
||||
/**
|
||||
* The extraction router (Schicht 0–2) — tuned for ONE model call per document.
|
||||
*
|
||||
* 0. deterministic vendor templates first (no LLM, instant);
|
||||
* 1. exactly one grammar-ENFORCED call (Ollama native `format`):
|
||||
* - flights → a flat ARRAY of legs in a single call (a capable model fills every
|
||||
* leg at once — far faster than one call per leg);
|
||||
* - otherwise → one flat single-reservation call: a type-specific schema when the type is
* obvious from keywords (the common case), else a union schema where the model picks the type;
|
||||
* 2. booking-wide fields (PNR, total price) and the overnight-arrival day are filled
|
||||
* DETERMINISTICALLY from the text — the model isn't asked to repeat or reason about them.
|
||||
*
|
||||
* No per-leg fan-out and no repair round-trips: that 4–8× call count was the latency that made
|
||||
* a multi-leg flight take minutes on a CPU host. The flat results map into the kitinerary
|
||||
* pipeline via the existing `nuExtractToKiReservations` mapper, so nothing downstream changes.
|
||||
*/
|
||||
|
||||
import type { KiReservation } from '../../booking-import/kitinerary.types';
|
||||
import { nuExtractToKiReservations } from '../clients/nuextract';
|
||||
import { FLAT_SCHEMA_BY_TYPE, FLIGHTS_ARRAY_SCHEMA, UNION_SINGLE_SCHEMA, type FlatType } from './flat-schemas';
|
||||
import { extractEnforced } from './ollama-format.client';
|
||||
import { matchVendorTemplate } from './vendor-templates';
|
||||
import type { FlatLike } from './validate';
|
||||
|
||||
/** Connection settings the router forwards to every `extractEnforced` call. */
export interface RouterContext {
  /** Base URL of the model backend. */
  baseUrl: string;
  /** Model identifier used for the extraction call. */
  model: string;
  /** Optional credential passed along with the request. */
  apiKey?: string;
}
|
||||
|
||||
/** Reservation types treated as journey legs (see `fixArrivalDate`). */
const TRANSPORT_TYPES: FlatType[] = ['flight', 'train', 'bus', 'ferry'];

/** Per-type guidance for the single-reservation prompt. */
const TYPE_HINT: Record<FlatType, string> = {
  flight: 'flight. vehicle_number = flight number, from_code/to_code = IATA codes, times = full ISO.',
  train: 'train. from_name/to_name = stations, vehicle_number = train number, times = full ISO.',
  bus: 'bus. from_name/to_name = stops, times = full ISO.',
  ferry: 'ferry/cruise. from_name/to_name = terminals/ports, times = full ISO.',
  car: 'rental car. from_name = pick-up location, to_name = return location (may differ), departure_time = pick-up, arrival_time = return.',
  hotel: 'hotel stay. name = hotel name, checkin_time/checkout_time = full ISO date-time.',
  restaurant: 'restaurant booking. name = the restaurant, start_time = the reservation date-time.',
  event: 'event/attraction. name = the event, start_time/end_time = full ISO.',
};
|
||||
|
||||
/**
 * Wrap a keyword alternation in Unicode-aware word edges.
 *
 * JavaScript's `\b` only treats ASCII letters/digits/underscore as word characters, so a
 * keyword that begins or ends with an umlaut ("Übernachtung", "Menü") never sits on a `\b`
 * at a normal word edge. Letter/number lookarounds with the `u` flag behave as intended.
 */
function withWordEdges(keywords: RegExp): RegExp {
  return new RegExp(`(?<![\\p{L}\\p{N}_])(?:${keywords.source})(?![\\p{L}\\p{N}_])`, 'iu');
}

/**
 * Keyword → reservation type, so an obvious document can use its type-specific schema
 * instead of the union schema. Order matters: the first matching entry wins.
 */
const TYPE_KEYWORDS: [FlatType, RegExp][] = [
  ['car', withWordEdges(/sixt|europcar|hertz|avis|enterprise|mietwagen|rental\s*car|autovermietung|anmietung|r(?:ü|ue)ckgabe|pick-?up|drop-?off/)],
  ['hotel', withWordEdges(/hotel|check-?in|check-?out|(?:ü|ue)bernachtung|zimmer|room\s*night|lodging|airbnb|b&b|hostel|pension/)],
  ['train', withWordEdges(/deutsche\s*bahn|bahn|train|railway|ice|zug|gleis|sncf|trenitalia|renfe/)],
  ['bus', withWordEdges(/flixbus|bus|coach|omnibus/)],
  ['ferry', withWordEdges(/f(?:ä|ae)hre|ferry|cruise|kreuzfahrt/)],
  ['restaurant', withWordEdges(/restaurant|tisch|table\s*for|men(?:ü|u)|gedeck/)],
  ['event', withWordEdges(/ticket|concert|konzert|veranstaltung|eintritt|admission/)],
];

/**
 * Return the first reservation type whose keywords occur in `text`, or `null` when no
 * keyword matches.
 */
function detectType(text: string): FlatType | null {
  const hit = TYPE_KEYWORDS.find(([, pattern]) => pattern.test(text));
  return hit ? hit[0] : null;
}
|
||||
|
||||
/**
 * Detect flight numbers: two uppercase letters, an optional whitespace character, then 2–4 digits.
 * Results are normalised to `XX123`, de-duplicated, and kept in order of first appearance.
 * A non-empty result is also what the router uses as its "is this a flight document" test.
 */
export function detectFlightNumbers(text: string): string[] {
  // A Set keeps insertion order, so it de-duplicates without changing the ordering.
  const seen = new Set<string>();
  for (const [, carrier, digits] of text.matchAll(/\b([A-Z]{2})\s?(\d{2,4})\b/g)) {
    seen.add(`${carrier}${digits}`);
  }
  return [...seen];
}
|
||||
|
||||
/**
 * The booking/confirmation code, pulled once for the whole document.
 *
 * Looks for a known label (PNR, Booking reference, Buchungsnummer, …) followed by a token of
 * five or more alphanumerics. The label match is case-insensitive, which also makes the token
 * class case-insensitive, so ordinary words after a label ("Confirmation email") would
 * qualify. A candidate is therefore accepted only if it contains a digit or is entirely
 * upper-case; otherwise the scan continues with the next labelled candidate.
 *
 * @returns the first plausible code, or `undefined` when none is found.
 */
export function extractBookingRef(text: string): string | undefined {
  const labelled =
    /(?:PNR|Buchungs(?:code|nummer|referenz)|Booking\s*(?:reference|code|number)|Confirmation(?:\s*number)?|Reservierungsnummer|Best(?:ä|ae)tigungsnummer|Reference)\s*:?\s*([A-Z0-9]{5,})/gi;
  for (const match of text.matchAll(labelled)) {
    const candidate = match[1];
    if (/\d/.test(candidate) || candidate === candidate.toUpperCase()) return candidate;
  }
  return undefined;
}
|
||||
|
||||
/**
 * Currency symbol/code → ISO 4217.
 * Recognises €, $ and £ anywhere in the token, otherwise accepts any three-letter code.
 */
function normCurrency(s: string): string | undefined {
  const token = s.toUpperCase();
  const known: [symbol: string, code: string][] = [
    ['€', 'EUR'],
    ['$', 'USD'],
    ['£', 'GBP'],
  ];
  for (const [symbol, code] of known) {
    if (token.includes(symbol) || token === code) return code;
  }
  return /^[A-Z]{3}$/.test(token) ? token : undefined;
}

/**
 * The booking total, pulled deterministically (raw amount string + ISO currency).
 *
 * Matches a total label followed by an amount. The currency may sit before the amount
 * (symbol or code, e.g. "€ 12,50", "EUR 120,00") or after it ("120,00 EUR"); the trailing
 * one wins when both are present. The amount must end on a digit so sentence punctuation
 * ("Total: 250.00.") is not captured as part of the price.
 *
 * @returns `null` when no labelled amount is found; `currency` is `undefined` when the
 *          text names none.
 */
export function extractTotalPrice(text: string): { price: string; currency?: string } | null {
  const m = text.match(
    /(?:Gesamtpreis|Gesamtbetrag|Gesamtsumme|Total(?:\s*(?:price|amount))?|Amount|Summe|Betrag)\s*:?\s*(?<pre>EUR|USD|GBP|CHF|[€$£])?\s*(?<amount>\d(?:[\d.,]*\d)?)\s*(?<post>EUR|USD|GBP|CHF|€|\$|£)?/i,
  );
  if (!m || !m.groups) return null;
  const { pre, amount, post } = m.groups;
  return { price: amount, currency: normCurrency(post ?? pre ?? '') };
}
|
||||
|
||||
/**
 * Derive a transport leg's arrival DATE deterministically: same day as departure, rolled to
 * the next day only when the arrival clock time is earlier than departure (an overnight leg).
 * The model reads clock times reliably but mishandles the day rollover.
 *
 * Only the HH:MM part of `arrival_time` is kept; its date is replaced. Non-transport types,
 * legs without a full `YYYY-MM-DDTHH:MM` departure or without an arrival clock time, and
 * legs whose departure date is not a real calendar date are returned unchanged.
 *
 * Mutates and returns `flat`.
 */
export function fixArrivalDate(flat: FlatLike): FlatLike {
  if (!TRANSPORT_TYPES.includes(flat.type)) return flat;

  const departure = /(\d{4}-\d{2}-\d{2})T(\d{2}:\d{2})/.exec(String(flat.departure_time ?? ''));
  const arrival = /(\d{2}:\d{2})/.exec(String(flat.arrival_time ?? ''));
  if (!departure || !arrival) return flat;

  const [, depDate, depTime] = departure;
  const arrTime = arrival[1];

  const day = new Date(`${depDate}T00:00:00Z`);
  // A digit pattern like 2025-13-40 matches the regex but is not a date; toISOString()
  // would throw on it and take the whole extraction down.
  if (Number.isNaN(day.getTime())) return flat;

  // Zero-padded HH:MM strings compare correctly as text.
  if (arrTime < depTime) day.setUTCDate(day.getUTCDate() + 1);
  flat.arrival_time = `${day.toISOString().slice(0, 10)}T${arrTime}:00`;
  return flat;
}
|
||||
|
||||
/** The flat fields that carry a date or date-time. */
const DATE_FIELDS = ['departure_time', 'arrival_time', 'checkin_time', 'checkout_time', 'start_time', 'end_time'] as const;

/**
 * Coerce a date value to ISO 8601 (`YYYY-MM-DDTHH:MM:00`).
 *
 * Models occasionally ignore the format instruction and emit a natural-language date
 * ("Aug 23 2025 13:30"). Handling, in order:
 *  - non-strings, blank strings and values already starting with `YYYY-MM-DD` are returned as-is;
 *  - a dotted European date `DD.MM.YYYY` (optionally followed by `HH:MM`) is converted
 *    explicitly, because the generic parser either rejects it or reads it month-first;
 *  - anything else goes through `Date.parse`; unparseable input is returned unchanged.
 *
 * `Date.parse` interprets zone-less text in the process's local time zone, so the result is
 * formatted with the local getters as well — the wall-clock components round-trip whatever
 * TZ the server runs in.
 */
function toIso(value: unknown): unknown {
  if (typeof value !== 'string' || !value.trim()) return value;
  if (/^\d{4}-\d{2}-\d{2}/.test(value)) return value;

  const pad = (n: number | string) => String(n).padStart(2, '0');

  const dotted = /(?<![\d.])(\d{1,2})\.(\d{1,2})\.(\d{4})(?:\D+(\d{1,2}):(\d{2}))?/.exec(value);
  if (dotted) {
    const [, dd, mm, yyyy, hh = '0', min = '0'] = dotted;
    const plausible =
      Number(dd) >= 1 && Number(dd) <= 31 && Number(mm) >= 1 && Number(mm) <= 12 && Number(hh) <= 23 && Number(min) <= 59;
    return plausible ? `${yyyy}-${pad(mm)}-${pad(dd)}T${pad(hh)}:${pad(min)}:00` : value;
  }

  const parsed = Date.parse(value);
  if (Number.isNaN(parsed)) return value;
  const d = new Date(parsed);
  return `${d.getFullYear()}-${pad(d.getMonth() + 1)}-${pad(d.getDate())}T${pad(d.getHours())}:${pad(d.getMinutes())}:00`;
}

/** Normalize every date-ish field present on a flat reservation to ISO, in place. */
function normalizeDates(flat: FlatLike): FlatLike {
  const record: Record<string, unknown> = flat;
  for (const field of DATE_FIELDS) {
    if (field in record) record[field] = toIso(record[field]);
  }
  return flat;
}
|
||||
|
||||
/**
 * One enforced call extracting every flight leg as a flat array.
 *
 * Each returned leg is tagged `type: 'flight'`, has its date fields normalised to ISO and
 * its arrival date derived from the departure. Yields `[]` when the model response has no
 * `flights` array (including a `null` result from `extractEnforced`).
 */
async function extractFlights(text: string, ctx: RouterContext): Promise<FlatLike[]> {
  const system =
    'Extract EVERY flight segment in the document (each flight number is one segment; a round trip has the ' +
    'outbound AND the return legs). vehicle_number = the flight number, from_code/to_code = 3-letter IATA codes, ' +
    "departure_time/arrival_time = full ISO 'YYYY-MM-DDTHH:MM:00' using the date of the section heading each flight is listed under.";

  const out = await extractEnforced({
    baseUrl: ctx.baseUrl,
    model: ctx.model,
    apiKey: ctx.apiKey,
    system,
    user: `Document:\n${text}`,
    schema: FLIGHTS_ARRAY_SCHEMA,
    numPredict: 900,
  });

  const flights: unknown = out?.flights;
  const legs = Array.isArray(flights) ? (flights as unknown[]) : [];
  return legs
    // Spreading a non-object (e.g. a stray string) would produce index keys, so skip those.
    .filter((leg): leg is Record<string, unknown> => typeof leg === 'object' && leg !== null)
    .map((leg) => fixArrivalDate(normalizeDates({ ...leg, type: 'flight' as FlatType })));
}
|
||||
|
||||
/**
 * One enforced call for a single reservation.
 *
 * When keywords make the type obvious, the type-specific schema is used and the type is set
 * from the keyword match. Otherwise the union schema is used and the model's own `type` is
 * taken, defaulting to `'hotel'` when the response carries no string `type`. A `null` model
 * result is treated as an empty object. Dates are normalised and, for transport types, the
 * arrival date is derived from the departure.
 */
async function extractSingle(text: string, ctx: RouterContext): Promise<FlatLike> {
  const known = detectType(text);

  const call = (schema: Record<string, unknown>, hint: string) =>
    extractEnforced({
      baseUrl: ctx.baseUrl,
      model: ctx.model,
      apiKey: ctx.apiKey,
      system: `Extract the single reservation from the document into the flat fields. ${hint} Omit any field that is truly absent.`,
      user: `Document:\n${text}`,
      schema,
    });

  if (known) {
    const typed = (await call(FLAT_SCHEMA_BY_TYPE[known], `It is a ${TYPE_HINT[known]}`)) ?? {};
    return fixArrivalDate(normalizeDates({ ...typed, type: known }));
  }

  const picked = (await call(UNION_SINGLE_SCHEMA, 'Pick the correct "type".')) ?? {};
  const type = (typeof picked.type === 'string' ? picked.type : 'hotel') as FlatType;
  return fixArrivalDate(normalizeDates({ ...picked, type }));
}
|
||||
|
||||
/**
 * Run the router on extracted document text and return schema.org KiReservation nodes.
 *
 * Steps:
 *  0. a matching vendor template short-circuits everything (no model call);
 *  1. exactly one model call — the flight-array path when the text contains flight numbers,
 *     otherwise the single-reservation path; a failure here is reported as a warning with
 *     empty `kiItems` instead of being thrown;
 *  2. the booking reference is copied onto every item that lacks one, and the booking total
 *     (with its currency) onto the first item only, both read deterministically from the text.
 */
export async function routeExtraction(text: string, ctx: RouterContext): Promise<{ kiItems: KiReservation[]; warnings: string[] }> {
  const warnings: string[] = [];

  // Schicht 0 — deterministic vendor templates (no LLM).
  const vendorHits = matchVendorTemplate(text);
  if (vendorHits && vendorHits.length > 0) {
    return { kiItems: nuExtractToKiReservations(vendorHits) as unknown as KiReservation[], warnings };
  }

  // Schicht 1 — exactly one model call.
  let flats: FlatLike[];
  try {
    const isFlightDocument = detectFlightNumbers(text).length > 0;
    flats = isFlightDocument ? await extractFlights(text, ctx) : [await extractSingle(text, ctx)];
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    return { kiItems: [], warnings: [`AI parsing failed — ${reason}`] };
  }

  // Schicht 2 — deterministic booking-wide fields the per-call schema doesn't carry.
  const ref = extractBookingRef(text);
  const total = extractTotalPrice(text);
  for (const flat of flats) {
    if (!flat.booking_reference && ref) flat.booking_reference = ref;
  }
  // The total belongs to the booking, so attach it once (the first item).
  const first = flats[0];
  if (first && total && first.price == null) {
    first.price = total.price;
    if (first.currency == null) first.currency = total.currency;
  }

  const kiItems = nuExtractToKiReservations(flats as unknown as Record<string, unknown>[]) as unknown as KiReservation[];
  return { kiItems, warnings };
}
|
||||
@@ -0,0 +1,111 @@
|
||||
/**
|
||||
* Type-specific FLAT JSON Schemas for the extraction router.
|
||||
*
|
||||
* The router drives a local model with a small, flat, single-reservation schema and
|
||||
* lets Ollama's native `format` parameter constrain sampling to it (grammar-level —
|
||||
* see ollama-format.client.ts). Two findings shape this:
|
||||
* - Enforcing the big nested `{reservations:[union of 8 types]}` schema makes small
|
||||
* local models collapse (grammar compliance falls off a cliff on deep schemas), so
|
||||
* we never enforce the monolith — only one flat object at a time.
|
||||
* - A flat schema whose key fields are `required` forces the model to actually fill
|
||||
* flightNumber / from / to / dates instead of leaving them null, which is the single
|
||||
* biggest reliability win for a small model.
|
||||
*
|
||||
* The flat field names match NUEXTRACT_TEMPLATE so the existing flat→schema.org mapper
|
||||
* (`nuExtractToKiReservations`) maps the result straight into the kitinerary pipeline.
|
||||
*/
|
||||
|
||||
/** The reservation kinds the flat schemas cover. */
export type FlatType = 'flight' | 'train' | 'bus' | 'ferry' | 'car' | 'hotel' | 'restaurant' | 'event';

/** Every `FlatType` as a runtime list (used as the `type` enum of the union schema). */
export const FLAT_TYPES: FlatType[] = ['flight', 'train', 'bus', 'ferry', 'car', 'hotel', 'restaurant', 'event'];

type JsonSchema = Record<string, unknown>;

/** Schema fragment shared by every field: all flat values are strings. */
const STR = { type: 'string' } as const;

/**
 * Build a flat object schema from a field list, marking `required` the ones enforcement
 * must guarantee.
 *
 * @param fields   every property the object may carry (each typed as a string)
 * @param required the subset that must be present in the output
 */
function flat(fields: string[], required: string[]): JsonSchema {
  const properties: Record<string, typeof STR> = Object.fromEntries(fields.map((field) => [field, STR]));
  return { type: 'object', properties, required };
}
|
||||
|
||||
/**
 * One schema per reservation type. `required` names the fields the model MUST emit;
 * everything else is optional. The `type` itself is not part of these schemas — the caller
 * already knows it when it picks a schema here and sets it on the result afterwards.
 */
export const FLAT_SCHEMA_BY_TYPE: Record<FlatType, JsonSchema> = {
  flight: flat(
    ['booking_reference', 'operator', 'vehicle_number', 'from_code', 'from_name', 'to_code', 'to_name', 'departure_time', 'arrival_time', 'seat', 'travel_class', 'price', 'currency'],
    // booking_reference (PNR) is REQUIRED: the mapper groups legs into one booking by
    // shared reservationNumber, so a missing PNR would split a round-trip into loose legs.
    // Enforcing it makes the small model actually copy it instead of leaving it null.
    ['vehicle_number', 'from_code', 'to_code', 'departure_time', 'booking_reference'],
  ),
  train: flat(
    ['booking_reference', 'operator', 'vehicle_number', 'from_name', 'to_name', 'departure_time', 'arrival_time', 'seat', 'travel_class', 'platform', 'price', 'currency'],
    ['from_name', 'to_name', 'departure_time'],
  ),
  bus: flat(
    ['booking_reference', 'operator', 'vehicle_number', 'from_name', 'to_name', 'departure_time', 'arrival_time', 'seat', 'price', 'currency'],
    ['from_name', 'to_name', 'departure_time'],
  ),
  ferry: flat(
    ['booking_reference', 'operator', 'name', 'from_name', 'to_name', 'departure_time', 'arrival_time', 'price', 'currency'],
    ['from_name', 'to_name', 'departure_time'],
  ),
  car: flat(
    ['booking_reference', 'operator', 'name', 'from_name', 'to_name', 'departure_time', 'arrival_time', 'price', 'currency'],
    ['from_name', 'departure_time', 'arrival_time'],
  ),
  hotel: flat(
    ['name', 'booking_reference', 'address', 'checkin_time', 'checkout_time', 'telephone', 'website', 'price', 'currency'],
    ['name', 'checkin_time', 'checkout_time'],
  ),
  restaurant: flat(
    ['name', 'booking_reference', 'address', 'start_time', 'end_time', 'telephone', 'website', 'price', 'currency'],
    ['name'],
  ),
  event: flat(
    ['name', 'booking_reference', 'address', 'start_time', 'end_time', 'telephone', 'website', 'price', 'currency'],
    ['name'],
  ),
};
|
||||
|
||||
/**
 * All flight legs of a document in ONE shot: an object with a single required `flights`
 * array of flat legs. The leg schema carries no booking reference or price — those
 * booking-wide fields are recovered deterministically afterwards.
 */
export const FLIGHTS_ARRAY_SCHEMA: JsonSchema = {
  type: 'object',
  properties: {
    flights: {
      type: 'array',
      items: flat(
        ['vehicle_number', 'operator', 'from_code', 'from_name', 'to_code', 'to_name', 'departure_time', 'arrival_time', 'seat', 'travel_class'],
        ['vehicle_number', 'from_code', 'to_code', 'departure_time'],
      ),
    },
  },
  required: ['flights'],
};
|
||||
|
||||
/**
 * Single-reservation fallback when the document type isn't obvious from keywords:
 * one flat object covering the fields of every type, where the model chooses `type`
 * itself. Only `type` is required.
 */
export const UNION_SINGLE_SCHEMA: JsonSchema = {
  type: 'object',
  properties: {
    type: { type: 'string', enum: FLAT_TYPES },
    name: STR,
    booking_reference: STR,
    operator: STR,
    vehicle_number: STR,
    from_name: STR,
    from_code: STR,
    to_name: STR,
    to_code: STR,
    departure_time: STR,
    arrival_time: STR,
    address: STR,
    checkin_time: STR,
    checkout_time: STR,
    start_time: STR,
    end_time: STR,
    telephone: STR,
    website: STR,
    price: STR,
    currency: STR,
  },
  required: ['type'],
};
|
||||
@@ -0,0 +1,91 @@
|
||||
/**
|
||||
* Minimal Ollama native-API client used by the extraction router.
|
||||
*
|
||||
* Why not the OpenAI-compatible `/v1/chat/completions` path the rest of llm-parse uses?
|
||||
* Ollama's `/v1` endpoint does NOT faithfully honour OpenAI's `response_format:{json_schema,strict}`
|
||||
* (it's passed through loosely — the schema and `strict` flag are effectively ignored).
|
||||
* Ollama's OWN `/api/chat` endpoint with a top-level `format: <jsonSchema>` is the path that
|
||||
* actually compiles the schema to a GBNF grammar and constrains token sampling. That hard
|
||||
* guarantee — valid, type-correct, all-required-fields JSON — is the router's foundation,
|
||||
* so the router talks to `/api/chat` directly. (Cloud providers enforce via their own strict
|
||||
* tool/response_format and keep using the existing clients.)
|
||||
*/
|
||||
|
||||
/** Upper bound for one request, in milliseconds (five minutes). */
const TIMEOUT_MS = 300_000;

/** Everything one schema-constrained chat call needs. */
export interface EnforcedExtractInput {
  /** Ollama base URL — accepts the addon's `…/v1` form; the `/v1` suffix is stripped. */
  baseUrl: string;
  /** Model name sent in the request body. */
  model: string;
  /** System message content. */
  system: string;
  /** User message content. */
  user: string;
  /** JSON Schema the output is constrained to (grammar-level). */
  schema: Record<string, unknown>;
  /** Sent as a bearer token when set. */
  apiKey?: string;
  /** Cap on generated tokens (`num_predict`); defaults to 512. */
  numPredict?: number;
  /** Context window (`num_ctx`); defaults to 8192. Raise for long itineraries. */
  numCtx?: number;
}
|
||||
|
||||
/**
 * Resolve the native API base from a config base URL that may end in `/v1`.
 *
 * Surrounding whitespace and trailing slashes are removed before and after dropping the
 * `/v1` suffix, so inputs like `http://host//v1/` do not leave a dangling slash that would
 * turn into `//api/chat`.
 */
export function toNativeBase(baseUrl: string): string {
  const trailingSlashes = /\/+$/;
  return baseUrl.trim().replace(trailingSlashes, '').replace(/\/v1$/, '').replace(trailingSlashes, '');
}
|
||||
|
||||
/**
 * Strip a leading/trailing Markdown code fence (``` or ```json) and JSON.parse the rest.
 * Returns null for empty input or when parsing fails.
 */
function parseJson(content: string | undefined | null): unknown {
  if (!content) return null;
  const unfenced = content
    .trim()
    .replace(/^```(?:json)?/i, '')
    .replace(/```$/, '')
    .trim();
  try {
    return JSON.parse(unfenced);
  } catch {
    return null;
  }
}
|
||||
|
||||
/**
 * Run one schema-constrained chat completion against Ollama's native `/api/chat`.
 *
 * @returns the parsed JSON value from the reply's message content, or null when that
 *          content is missing, unparseable, or not an object.
 * @throws Error when the server answers with a non-2xx status (message includes the status
 *         and the first 200 characters of the body), when the request exceeds `TIMEOUT_MS`,
 *         or when the network call / body decoding fails.
 */
export async function extractEnforced(input: EnforcedExtractInput): Promise<Record<string, unknown> | null> {
  const url = `${toNativeBase(input.baseUrl)}/api/chat`;
  const body = {
    model: input.model,
    stream: false,
    format: input.schema,
    // Keep the model resident a while so back-to-back imports don't pay the cold load.
    keep_alive: '30m',
    options: { temperature: 0, num_predict: input.numPredict ?? 512, num_ctx: input.numCtx ?? 8192 },
    messages: [
      { role: 'system', content: input.system },
      { role: 'user', content: input.user },
    ],
  };

  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), TIMEOUT_MS);
  try {
    const res = await fetch(url, {
      method: 'POST',
      signal: controller.signal,
      headers: {
        'content-type': 'application/json',
        ...(input.apiKey ? { authorization: `Bearer ${input.apiKey}` } : {}),
      },
      body: JSON.stringify(body),
    });

    if (!res.ok) {
      const detail = await res.text().catch(() => '');
      throw new Error(`Ollama /api/chat failed (${res.status}): ${detail.slice(0, 200)}`);
    }

    // The body is read inside the try so the timeout also covers a stalled download.
    const data = (await res.json()) as { message?: { content?: string } };
    const parsed = parseJson(data.message?.content);
    return parsed && typeof parsed === 'object' ? (parsed as Record<string, unknown>) : null;
  } catch (err) {
    // Only our own timer aborts this controller, so an aborted signal means a timeout;
    // say so instead of surfacing a bare abort error.
    if (controller.signal.aborted) {
      throw new Error(`Ollama /api/chat timed out after ${TIMEOUT_MS / 1000}s`);
    }
    throw err;
  } finally {
    clearTimeout(timer);
  }
}
|
||||
@@ -0,0 +1,102 @@
|
||||
/**
|
||||
* Schicht 2 — semantic validation of an extracted flat reservation.
|
||||
*
|
||||
* Constrained decoding guarantees the JSON is structurally valid, but NOT that the
|
||||
* values make sense. This layer catches the failure modes that actually hurt users —
|
||||
* a date with no day, a check-out before check-in, a bogus IATA code, a missing
|
||||
* booking reference — and returns a human-readable problem list. The list is phrased so it
* can be fed back to the model as a targeted repair prompt; whatever still fails is left for
* the human (the review-before-save modal, Schicht 3) rather than silently dropped.
|
||||
*/
|
||||
|
||||
import { findByIata } from '../../../services/airportService';
|
||||
import type { FlatType } from './flat-schemas';
|
||||
|
||||
/** True when `v` is a string containing a full calendar date (YYYY-MM-DD), not just a time. */
function hasFullDate(v: unknown): boolean {
  if (typeof v !== 'string') return false;
  return /\d{4}-\d{2}-\d{2}/.test(v);
}
|
||||
|
||||
/** The first YYYY-MM-DD occurrence inside a string value, or null (also for non-strings). */
function datePart(v: unknown): string | null {
  if (typeof v !== 'string') return null;
  const found = /\d{4}-\d{2}-\d{2}/.exec(v);
  return found ? found[0] : null;
}
|
||||
|
||||
/** True when `v` is a string that, once trimmed, is exactly three ASCII letters. */
function looksLikeIata(v: unknown): boolean {
  if (typeof v !== 'string') return false;
  return /^[A-Za-z]{3}$/.test(v.trim());
}
|
||||
|
||||
/**
 * The flat reservation shape shared by the router and the validator. Only `type` is
 * mandatory; the named fields are the ones the validator inspects, and any other field is
 * allowed through the index signature.
 */
export interface FlatLike {
  type: FlatType;
  booking_reference?: string;
  vehicle_number?: string;
  from_code?: string;
  to_code?: string;
  from_name?: string;
  to_name?: string;
  departure_time?: string;
  arrival_time?: string;
  checkin_time?: string;
  checkout_time?: string;
  [k: string]: unknown;
}

/** Types validated as journey legs (locations plus a dated departure). */
const TRANSPORT: FlatType[] = ['flight', 'train', 'bus', 'ferry'];
|
||||
|
||||
/**
 * Return a list of human-readable problems with a flat reservation, suitable for a
 * repair prompt. An empty list means it passed.
 *
 * Checks, by type:
 *  - any type: a booking reference, unless `requireReference` is false;
 *  - transport (flight/train/bus/ferry): a departure and an arrival location, a dated
 *    departure, and an arrival that is not before the departure;
 *  - flight additionally: a flight number, and airport codes that are three letters and
 *    known to `findByIata` (compared trimmed and upper-cased);
 *  - hotel: dated check-in/check-out, with check-out not before check-in;
 *  - car: dated pick-up and return.
 *
 * @param flat             the reservation to check (not modified)
 * @param requireReference report a missing booking code (default true)
 */
export function validateFlat(flat: FlatLike, requireReference = true): string[] {
  const problems: string[] = [];
  const kind = flat.type;

  if (requireReference && !str(flat.booking_reference)) {
    problems.push('the booking/confirmation reference is missing — copy it from the document');
  }

  if (TRANSPORT.includes(kind)) {
    if (!str(flat.from_code) && !str(flat.from_name)) problems.push('missing departure location');
    if (!str(flat.to_code) && !str(flat.to_name)) problems.push('missing arrival location');

    const departureDated = hasFullDate(flat.departure_time);
    if (!departureDated) {
      problems.push("departure_time must be a full date-time (YYYY-MM-DDTHH:MM:00) using THIS segment's date");
    }

    if (kind === 'flight') {
      if (!str(flat.vehicle_number)) problems.push('missing flight number');

      const endpoints = [
        ['departure', flat.from_code],
        ['arrival', flat.to_code],
      ] as const;
      for (const [label, code] of endpoints) {
        if (!str(code)) continue;
        if (!looksLikeIata(code)) {
          problems.push(`${label} airport code "${String(code)}" is not a 3-letter IATA code`);
          continue;
        }
        // looksLikeIata tolerates surrounding whitespace, so the lookup must trim as well;
        // otherwise " FRA " would be reported as an unknown airport.
        const iata = String(code).trim().toUpperCase();
        if (!findByIata(iata)) {
          problems.push(`${label} airport code "${iata}" is not a known IATA code — re-check it`);
        }
      }
    }

    if (departureDated && hasFullDate(flat.arrival_time)) {
      const arrival = new Date(flat.arrival_time as string);
      const departure = new Date(flat.departure_time as string);
      if (arrival < departure) problems.push('arrival_time is before departure_time — re-read the times');
    }
  }

  if (kind === 'hotel') {
    if (!hasFullDate(flat.checkin_time)) problems.push('checkin_time must be a full date');
    if (!hasFullDate(flat.checkout_time)) problems.push('checkout_time must be a full date');
    const checkIn = datePart(flat.checkin_time);
    const checkOut = datePart(flat.checkout_time);
    // YYYY-MM-DD strings order correctly under plain string comparison.
    if (checkIn && checkOut && checkOut < checkIn) problems.push('check-out date is before check-in — re-read both dates');
  }

  if (kind === 'car') {
    if (!hasFullDate(flat.departure_time)) problems.push('the pickup date-time (departure_time) must be a full date');
    if (!hasFullDate(flat.arrival_time)) problems.push('the return date-time (arrival_time) must be a full date');
  }

  return problems;
}
|
||||
|
||||
/**
 * Tell whether a value is a usable string.
 *
 * @param v Any value, typically a field read from a flat reservation.
 * @returns `true` only for strings that still have content after trimming whitespace.
 */
function str(v: unknown): boolean {
  if (typeof v !== 'string') return false;
  return v.trim() !== '';
}
|
||||
@@ -0,0 +1,101 @@
|
||||
/**
|
||||
* Layer 0 ("Schicht 0") — deterministic vendor templates.
|
||||
*
|
||||
* KItinerary already handles documents with machine-readable data (boarding-pass
|
||||
* barcodes, UIC rail codes, embedded schema.org JSON-LD) upstream of the LLM. This
|
||||
* layer extends the deterministic net to a handful of high-volume vendors whose plain
|
||||
* PDFs carry NO barcode but a stable text layout (Booking.com, Expedia, Airbnb, the big
|
||||
* airlines, Sixt/Europcar…). A matched template returns a fully-formed result with ZERO
|
||||
* model inference — instant, free, and 100% repeatable — so the common case never loads
|
||||
* the CPU. The LLM router only runs for the long tail.
|
||||
*
|
||||
* Templates emit the same flat field shape the router uses, so they feed the identical
|
||||
* `nuExtractToKiReservations` mapper. Each template must be CONSERVATIVE: fire only on an
|
||||
* unambiguous marker and only emit fields it can read with certainty — a wrong
|
||||
* deterministic answer is worse than deferring to the model. This file is the seam where
|
||||
* new vendor extractors are added; it ships with one worked example.
|
||||
*/
|
||||
|
||||
import type { FlatType } from './flat-schemas';
|
||||
|
||||
/**
 * Flat reservation record emitted by a vendor template. Only `type` is required; every
 * other field is optional so a template can leave out anything it could not read.
 */
export interface FlatReservation {
  /** Reservation kind. */
  type: FlatType;

  // Booking-wide fields.
  booking_reference?: string;
  operator?: string;
  name?: string;
  price?: string;
  currency?: string;

  // Journey endpoints and times (for a car rental: pickup and return).
  from_name?: string;
  to_name?: string;
  departure_time?: string;
  arrival_time?: string;

  // Stay fields.
  address?: string;
  checkin_time?: string;
  checkout_time?: string;

  /** Any further flat field a template wants to pass along. */
  [k: string]: unknown;
}
|
||||
|
||||
/** One deterministic extractor for a single vendor's document layout. */
interface VendorTemplate {
  /** Identifier of the template. */
  name: string;

  /**
   * Inexpensive recognition step.
   *
   * @returns `true` when `text` looks like a document from this vendor.
   */
  match(text: string): boolean;

  /**
   * Read the reservation(s) out of a recognised document.
   *
   * @returns The reservations found, or an empty array when the layout did not parse
   *   as expected.
   */
  extract(text: string): FlatReservation[];
}
|
||||
|
||||
/**
 * Parse the first German/EU date in `text`, optionally followed by a time
 * ("24.12.2026, 10:00" / "24.12.2026 10:00 Uhr"), into ISO form.
 *
 * Only the first date-shaped candidate is considered. A candidate that is not a real
 * calendar date, or whose time is out of range, yields `null` instead of a malformed
 * ISO string, so the caller can fall back rather than trust a wrong value.
 *
 * @param text Text to scan.
 * @returns `YYYY-MM-DD` when no time follows the date, `YYYY-MM-DDTHH:MM:00` when one
 *   does, or `null` when no valid date is found.
 */
function deDateTime(text: string): string | null {
  // The lookarounds stop the pattern from biting into a longer digit run
  // ("124.12.2026", "24.12.20261"), which would produce a plausible but wrong date.
  const m = text.match(/(?<!\d)(\d{2})\.(\d{2})\.(\d{4})(?!\d)(?:[,\s]+(\d{1,2}):(\d{2}))?/);
  if (!m) return null;
  const [, d, mo, y, h, mi] = m;

  // Round-trip through a UTC date: impossible days/months (31.02., 45.13.) roll over
  // and no longer compare equal to what was read.
  const year = Number(y);
  const month = Number(mo);
  const day = Number(d);
  const probe = new Date(Date.UTC(year, month - 1, day));
  const isRealDate =
    probe.getUTCFullYear() === year && probe.getUTCMonth() === month - 1 && probe.getUTCDate() === day;
  if (!isRealDate) return null;

  const date = `${y}-${mo}-${d}`;
  if (!h) return date;

  if (Number(h) > 23 || Number(mi) > 59) return null;
  return `${date}T${h.padStart(2, '0')}:${mi}:00`;
}
|
||||
|
||||
/**
 * Example: Sixt rental confirmation. Sixt print-PDFs carry no barcode but a stable
 * "Reservierungsnummer" + Anmietung/Rückgabe block. Conservative: only fires on the Sixt
 * marker, only emits fields it can read unambiguously, and bails to the LLM otherwise.
 *
 * `extract` returns a single `car` reservation, or `[]` when the reference, either
 * endpoint, or either endpoint's date cannot be read.
 */
const sixt: VendorTemplate = {
  name: 'sixt-rental',
  match: (t) => /\bSIXT\b/i.test(t) && /Reservierungsnummer/i.test(t),
  extract: (t) => {
    const ref = t.match(/Reservierungsnummer:?\s*([A-Z0-9]{6,})/i)?.[1];
    const pickupM = /Anmietung:?\s*(.+)/i.exec(t);
    const dropoffM = /R(?:ü|ue)ckgabe:?\s*(.+)/i.exec(t);
    const pickup = pickupM?.[1]?.trim();
    const dropoff = dropoffM?.[1]?.trim();
    if (!ref || !pickupM || !dropoffM || !pickup || !dropoff) return [];

    // The reference pattern is case-insensitive and its `\s*` may cross a line break, so
    // without a digit it can capture the next label word (e.g. "Anmietung") instead of
    // a number. Defer to the LLM rather than emit that.
    if (!/\d/.test(ref)) return [];

    /**
     * First date inside one endpoint's own stretch of text. The stretch starts at the
     * value captured for that label (located from the match index, not from the first
     * occurrence of the same text anywhere in the document) and stops at the other
     * endpoint's label when that label comes later, so an endpoint without a date
     * cannot borrow the other endpoint's date.
     * NOTE(review): the endpoint that comes last still scans to the end of the text.
     */
    const dateWithin = (own: RegExpExecArray, value: string, other: RegExpExecArray): string | null => {
      const start = t.indexOf(value, own.index);
      const end = other.index > own.index ? other.index : t.length;
      return deDateTime(t.slice(start, end));
    };
    const pickupTime = dateWithin(pickupM, pickup, dropoffM);
    const dropoffTime = dateWithin(dropoffM, dropoff, pickupM);
    // Need at least a reference and both endpoints with dates to trust the template.
    if (!pickupTime || !dropoffTime) return [];

    // Drop a trailing " - DD.MM.YYYY…" suffix so only the place name remains.
    const place = (s: string) => s.replace(/\s*[-–]\s*\d{2}\.\d{2}\.\d{4}.*$/, '').trim();
    const priceM = t.match(/Gesamtpreis:?\s*([\d.,]+)\s*(EUR|€)/i);
    return [
      {
        type: 'car',
        operator: 'SIXT',
        booking_reference: ref,
        from_name: place(pickup),
        to_name: place(dropoff),
        departure_time: pickupTime,
        arrival_time: dropoffTime,
        ...(priceM ? { price: priceM[1], currency: 'EUR' } : {}),
      },
    ];
  },
};
|
||||
|
||||
/** Registered vendor templates, tried in array order. */
const TEMPLATES: VendorTemplate[] = [sixt];

/**
 * Run the document text through the registered vendor templates.
 *
 * @param text Plain text of the document.
 * @returns The reservations from the first template that both recognises the document
 *   and extracts at least one reservation, or `null` when none does (the router then
 *   falls through to the LLM).
 */
export function matchVendorTemplate(text: string): FlatReservation[] | null {
  for (const template of TEMPLATES) {
    // A template that does not recognise the vendor counts the same as one that
    // recognised it but parsed nothing: move on to the next template.
    const reservations = template.match(text) ? template.extract(text) : [];
    if (reservations.length > 0) return reservations;
  }
  return null;
}
|
||||
Reference in New Issue
Block a user