mirror of
https://github.com/mauriceboe/TREK.git
synced 2026-06-30 18:46:00 +00:00
refactor(extract): drop vendor templates, let the model drive with deterministic backfill
Now that a capable instruct model (Qwen3-8B, thinking off) reads name/address/dates/legs reliably across formats, the per-vendor template short-circuit distorted more than it fixed: it was brittle on layout variations and overrode the better model output. Remove the template layer; the model extracts the structure, and Schicht 2 (the deterministic layer-2 backfill) fills in the confirmation/total and takes the currency from the document's own symbol (correcting model misreads such as ¥→$). Per-type prompts now also ask for address and price/currency.
This commit is contained in:
@@ -1,25 +1,27 @@
|
||||
/**
|
||||
* The extraction router (Schicht 0–2) — tuned for ONE model call per document.
|
||||
* The extraction router — tuned for ONE model call per document.
|
||||
*
|
||||
* 0. deterministic vendor templates first (no LLM, instant);
|
||||
* 1. exactly one grammar-ENFORCED call (Ollama native `format`):
|
||||
* - flights → a flat ARRAY of legs in a single call (a capable model fills every
|
||||
* leg at once — far faster than one call per leg);
|
||||
* - otherwise → one flat single-reservation call, on the FAST model when the type is
|
||||
* obvious from keywords (the common case), else the strong model with a union schema;
|
||||
* 2. booking-wide fields (PNR, total price) and the overnight-arrival day are filled
|
||||
* DETERMINISTICALLY from the text — the model isn't asked to repeat or reason about them.
|
||||
* - otherwise → one flat single-reservation call, with a type-specific schema when the
|
||||
* type is obvious from keywords (the common case), else a union schema the model picks;
|
||||
* 2. booking-wide fields (PNR, total price, currency) and the overnight-arrival day are filled
|
||||
* DETERMINISTICALLY from the text — the model isn't asked to reason about them, and the
|
||||
* document's own currency symbol corrects the model where it misreads it.
|
||||
*
|
||||
* No per-leg fan-out and no repair round-trips: that 4–8× call count was the latency that made
|
||||
* a multi-leg flight take minutes on a CPU host. The flat results map into the kitinerary
|
||||
* pipeline via the existing `nuExtractToKiReservations` mapper, so nothing downstream changes.
|
||||
* A capable instruct model (e.g. Qwen3-8B with thinking disabled) reads name/address/dates/
|
||||
* legs reliably across formats, so there's no per-vendor template layer to drift or distort —
|
||||
* the model handles the long tail and Schicht 2 backstops the money/reference fields. No per-leg
|
||||
* fan-out and no repair round-trips: that 4–8× call count was the latency that made a multi-leg
|
||||
* flight take minutes on a CPU host. The flat results map into the kitinerary pipeline via the
|
||||
* existing `nuExtractToKiReservations` mapper, so nothing downstream changes.
|
||||
*/
|
||||
|
||||
import type { KiReservation } from '../../booking-import/kitinerary.types';
|
||||
import { nuExtractToKiReservations } from '../clients/nuextract';
|
||||
import { FLAT_SCHEMA_BY_TYPE, FLIGHTS_ARRAY_SCHEMA, UNION_SINGLE_SCHEMA, type FlatType } from './flat-schemas';
|
||||
import { extractEnforced } from './ollama-format.client';
|
||||
import { matchVendorTemplate, normCurrency } from './vendor-templates';
|
||||
import type { FlatLike } from './validate';
|
||||
|
||||
export interface RouterContext {
|
||||
@@ -30,16 +32,17 @@ export interface RouterContext {
|
||||
|
||||
const TRANSPORT_TYPES: FlatType[] = ['flight', 'train', 'bus', 'ferry'];
|
||||
|
||||
/** Per-type guidance for the single-reservation prompt. */
|
||||
/** Per-type guidance for the single-reservation prompt. `price`/`currency` are the total
|
||||
* paid and its currency on every type; `address` is the venue street address for stays/venues. */
|
||||
const TYPE_HINT: Record<FlatType, string> = {
|
||||
flight: 'flight. vehicle_number = flight number, from_code/to_code = IATA codes, times = full ISO.',
|
||||
train: 'train. from_name/to_name = stations, vehicle_number = train number, times = full ISO.',
|
||||
bus: 'bus. from_name/to_name = stops, times = full ISO.',
|
||||
ferry: 'ferry/cruise. from_name/to_name = terminals/ports, times = full ISO.',
|
||||
car: 'rental car. from_name = pick-up location, to_name = return location (may differ), departure_time = pick-up, arrival_time = return.',
|
||||
hotel: 'hotel stay. name = hotel name, checkin_time/checkout_time = full ISO date-time.',
|
||||
restaurant: 'restaurant booking. name = the restaurant, start_time = the reservation date-time.',
|
||||
event: 'event/attraction. name = the event, start_time/end_time = full ISO.',
|
||||
flight: 'flight. vehicle_number = flight number, from_code/to_code = IATA codes, times = full ISO, price/currency = total fare.',
|
||||
train: 'train. from_name/to_name = stations, vehicle_number = train number, times = full ISO, price/currency = total fare.',
|
||||
bus: 'bus. from_name/to_name = stops, times = full ISO, price/currency = total fare.',
|
||||
ferry: 'ferry/cruise. from_name/to_name = terminals/ports, times = full ISO, price/currency = total fare.',
|
||||
car: 'rental car. from_name = pick-up location, to_name = return location (may differ), departure_time = pick-up, arrival_time = return, price/currency = total rental cost.',
|
||||
hotel: 'hotel stay. name = hotel name, address = the hotel street address, checkin_time/checkout_time = full ISO date-time, price/currency = total paid.',
|
||||
restaurant: 'restaurant booking. name = the restaurant, address = its street address, start_time = the reservation date-time, price/currency = total if shown.',
|
||||
event: 'event/attraction. name = the event/ticket, address = the venue, start_time/end_time = full ISO, price/currency = ticket price.',
|
||||
};
|
||||
|
||||
/** Keyword → reservation type, so an obvious document skips the costlier union/strong path. */
|
||||
@@ -79,11 +82,21 @@ export function extractBookingRef(text: string): string | undefined {
|
||||
// do, while the case-insensitive [A-Z0-9] class would otherwise grab a following prose
|
||||
// word ("Confirmation\nThank you…" → "Thank") after a bare label.
|
||||
const m = text.match(
|
||||
/(?:PNR|Buchungs(?:code|nummer|referenz)|Booking\s*(?:reference|code|number)|Confirmation\s*(?:number|code)?|Reservierungsnummer|Reservation\s*(?:No\.?|Number|Nr\.?)|Best(?:ä|ae)tigungs[-\s]?(?:nummer|code)|Reference)\s*:?\s*((?=[A-Z0-9]*\d)[A-Z0-9]{5,})/i,
|
||||
/(?:PNR|Buchungs(?:code|nummer|referenz)|Booking\s*(?:reference|code|number)|Confirmation\s*(?:number|code)?|Reservierungsnummer|Reservation\s*(?:No\.?|Number|Nr\.?)|Best(?:ä|ae)tigungs[-\s]?(?:nummer|code)|(?:Expedia[-\s]*)?Reiseplan|Reference)\s*:?\s*((?=[A-Z0-9]*\d)[A-Z0-9]{5,})/i,
|
||||
);
|
||||
return m?.[1];
|
||||
}
|
||||
|
||||
/**
 * Currency symbol/code → ISO 4217, or undefined when none is recognised.
 *
 * Recognition order (first hit wins): `€`, `$`, `£`, `¥`, then a bare three-letter code.
 * Surrounding whitespace is ignored and matching is case-insensitive.
 *
 * The dollar and yen/yuan signs are shared by several currencies, so a well-known
 * country prefix written directly before the sign is honoured (`CA$` → CAD, `HK$` → HKD,
 * `CN¥` → CNY). A bare `$` stays USD and a bare `¥` stays JPY.
 *
 * @param token a currency symbol, a prefixed symbol, or a three-letter code
 * @returns the ISO 4217 code, or undefined when the token is not recognised
 */
export function normCurrency(token: string): string | undefined {
  // Country prefixes that conventionally disambiguate the dollar sign.
  const dollarByPrefix: Readonly<Record<string, string>> = {
    US: 'USD',
    CA: 'CAD',
    A: 'AUD',
    AU: 'AUD',
    NZ: 'NZD',
    HK: 'HKD',
    S: 'SGD',
    SG: 'SGD',
  };

  // Trim first so a padded code (" EUR ") still passes the three-letter check below.
  const u = token.trim().toUpperCase();

  if (u.includes('€')) return 'EUR';
  if (u.includes('$')) {
    // Only letters written directly before the sign count as a prefix.
    const prefix = u.match(/([A-Z]{1,2})\$/)?.[1];
    const iso = prefix !== undefined ? dollarByPrefix[prefix] : undefined;
    return iso ?? 'USD';
  }
  if (u.includes('£')) return 'GBP';
  if (u.includes('¥')) return u.includes('CN¥') ? 'CNY' : 'JPY';

  return /^[A-Z]{3}$/.test(u) ? u : undefined;
}
|
||||
|
||||
/** The booking total, pulled deterministically (raw amount string + ISO currency). */
|
||||
export function extractTotalPrice(text: string): { price: string; currency?: string } | null {
|
||||
const strip = (s: string) => s.replace(/[€$£¥\s]/g, '');
|
||||
@@ -174,11 +187,10 @@ async function extractSingle(text: string, ctx: RouterContext): Promise<FlatLike
|
||||
}
|
||||
|
||||
/**
|
||||
* Schicht 2 — fill the booking-wide fields the per-reservation extraction doesn't carry:
|
||||
* the confirmation/PNR and the booking total. Applied to BOTH the deterministic vendor
|
||||
* results AND the model output, so a vendor template that read the structured fields but
|
||||
* whose narrow ref/price regex missed still gets the broad doc-wide deterministic value.
|
||||
* Never overrides a value the source already provided.
|
||||
* Schicht 2 — fill the booking-wide fields the per-reservation model call doesn't reliably
|
||||
* carry: the confirmation/PNR and the booking total + its currency. The confirmation and a
|
||||
* missing price are filled from the document; the currency is taken from the document's own
|
||||
* symbol/code (authoritative — small models misread it), correcting the model where needed.
|
||||
*/
|
||||
function fillBookingWideFields(flats: Record<string, unknown>[], text: string): void {
|
||||
const ref = extractBookingRef(text);
|
||||
@@ -188,10 +200,12 @@ function fillBookingWideFields(flats: Record<string, unknown>[], text: string):
|
||||
const priceMissing = (v: unknown) => v == null || (typeof v === 'string' && v.trim() === '');
|
||||
flats.forEach((f, i) => {
|
||||
if (!f.booking_reference && ref) f.booking_reference = ref;
|
||||
// The total belongs to the booking, so attach it once (the first item).
|
||||
if (i === 0 && total && priceMissing(f.price)) {
|
||||
f.price = total.price;
|
||||
if (f.currency == null) f.currency = total.currency;
|
||||
// The total belongs to the booking, so handle it once (the first item).
|
||||
if (i === 0 && total) {
|
||||
if (priceMissing(f.price)) f.price = total.price;
|
||||
// The document's own currency symbol/code is authoritative; let it override the
|
||||
// model's guess (small models misread "¥" as "$").
|
||||
if (total.currency) f.currency = total.currency;
|
||||
}
|
||||
});
|
||||
}
|
||||
@@ -203,15 +217,6 @@ function fillBookingWideFields(flats: Record<string, unknown>[], text: string):
|
||||
export async function routeExtraction(text: string, ctx: RouterContext): Promise<{ kiItems: KiReservation[]; warnings: string[] }> {
|
||||
const warnings: string[] = [];
|
||||
|
||||
// Schicht 0 — deterministic vendor templates (no LLM). Still top-up the booking-wide
|
||||
// fields so a template miss on the ref/price doesn't drop them when the doc-wide
|
||||
// deterministic extractor would have found them.
|
||||
const vendor = matchVendorTemplate(text);
|
||||
if (vendor && vendor.length > 0) {
|
||||
fillBookingWideFields(vendor, text);
|
||||
return { kiItems: nuExtractToKiReservations(vendor) as unknown as KiReservation[], warnings };
|
||||
}
|
||||
|
||||
// Schicht 1 — exactly one model call.
|
||||
let flats: FlatLike[];
|
||||
try {
|
||||
|
||||
@@ -1,232 +0,0 @@
|
||||
/**
|
||||
* Schicht 0 — deterministic vendor templates.
|
||||
*
|
||||
* KItinerary already handles documents with machine-readable data (boarding-pass
|
||||
* barcodes, UIC rail codes, embedded schema.org JSON-LD) upstream of the LLM. This
|
||||
* layer extends the deterministic net to a handful of high-volume vendors whose plain
|
||||
* PDFs carry NO barcode but a stable text layout (Booking.com, Expedia, Airbnb, the big
|
||||
* airlines, Sixt/Europcar…). A matched template returns a fully-formed result with ZERO
|
||||
* model inference — instant, free, and 100% repeatable — so the common case never loads
|
||||
* the CPU. The LLM router only runs for the long tail.
|
||||
*
|
||||
* Templates emit the same flat field shape the router uses, so they feed the identical
|
||||
* `nuExtractToKiReservations` mapper. Each template must be CONSERVATIVE: fire only on an
|
||||
* unambiguous marker and only emit fields it can read with certainty — a wrong
|
||||
* deterministic answer is worse than deferring to the model. This file is the seam where
|
||||
* new vendor extractors are added; it ships with one worked example.
|
||||
*/
|
||||
|
||||
import type { FlatType } from './flat-schemas';
|
||||
|
||||
export interface FlatReservation {
|
||||
type: FlatType;
|
||||
booking_reference?: string;
|
||||
operator?: string;
|
||||
name?: string;
|
||||
from_name?: string;
|
||||
to_name?: string;
|
||||
departure_time?: string;
|
||||
arrival_time?: string;
|
||||
address?: string;
|
||||
checkin_time?: string;
|
||||
checkout_time?: string;
|
||||
price?: string;
|
||||
currency?: string;
|
||||
[k: string]: unknown;
|
||||
}
|
||||
|
||||
interface VendorTemplate {
|
||||
name: string;
|
||||
/** Cheap check: is this that vendor's document at all? */
|
||||
match(text: string): boolean;
|
||||
/** Pull the reservation(s); return [] if the layout didn't parse as expected. */
|
||||
extract(text: string): FlatReservation[];
|
||||
}
|
||||
|
||||
/**
 * Parse a German/EU numeric date + optional time ("24.12.2026, 10:00" /
 * "24.12.2026 10:00 Uhr" / "3.5.2026") to ISO.
 *
 * Day and month may be written with one or two digits and are zero-padded in the result.
 * Candidates that are not real calendar dates (e.g. "99.99.2026", "31.02.2026") are
 * skipped and the scan continues, so a later valid date in the same text is still found.
 * A time is appended only when it is a valid 24-hour clock value; otherwise the date is
 * returned on its own.
 *
 * @param text free text that may contain a `D.M.YYYY` date, optionally followed by `H:MM`
 * @returns `YYYY-MM-DD` or `YYYY-MM-DDTHH:MM:00` for the first valid date, or null if none
 */
function deDateTime(text: string): string | null {
  const pattern = /(\d{1,2})\.(\d{1,2})\.(\d{4})(?:[,\s]+(\d{1,2}):(\d{2}))?/g;
  const two = (n: number): string => String(n).padStart(2, '0');

  for (const m of text.matchAll(pattern)) {
    const day = Number(m[1]);
    const month = Number(m[2]);
    const year = Number(m[3]);

    // Round-trip through Date to reject impossible days such as 31.02. or 45.13.
    const probe = new Date(Date.UTC(year, month - 1, day));
    const realDate =
      month >= 1 && month <= 12 && probe.getUTCMonth() === month - 1 && probe.getUTCDate() === day;
    if (!realDate) continue;

    const date = `${m[3]}-${two(month)}-${two(day)}`;
    if (m[4] === undefined || m[5] === undefined) return date;

    const hour = Number(m[4]);
    const minute = Number(m[5]);
    // A nonsensical clock value is treated as noise after a good date, not as a failure.
    if (hour > 23 || minute > 59) return date;
    return `${date}T${two(hour)}:${two(minute)}:00`;
  }
  return null;
}
|
||||
|
||||
/** German month name/abbreviation → month number (matched on the first three letters). */
|
||||
const DE_MONTHS: Record<string, number> = {
|
||||
jan: 1, feb: 2, 'mär': 3, mrz: 3, apr: 4, mai: 5, jun: 6, jul: 7, aug: 8, sep: 9, okt: 10, nov: 11, dez: 12,
|
||||
};
|
||||
/** English month name/abbreviation → month number (matched on the first three letters). */
|
||||
const EN_MONTHS: Record<string, number> = {
|
||||
jan: 1, feb: 2, mar: 3, apr: 4, may: 5, jun: 6, jul: 7, aug: 8, sep: 9, oct: 10, nov: 11, dec: 12,
|
||||
};
|
||||
|
||||
/** Parse a German long-form date ("3. Mai 2026", "27. Aug. 2025") to an ISO date — no time. */
|
||||
function deLongDate(text: string): string | null {
|
||||
const m = text.match(/(\d{1,2})\.\s*([A-Za-zäöüÄÖÜ]+)\.?\s+(\d{4})/);
|
||||
if (!m) return null;
|
||||
const mo = DE_MONTHS[m[2].slice(0, 3).toLowerCase()];
|
||||
if (!mo) return null;
|
||||
return `${m[3]}-${String(mo).padStart(2, '0')}-${m[1].padStart(2, '0')}`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse an English date + optional time to ISO. Tolerates a comma after the day
|
||||
* ("Aug 5, 2025") and a 12-hour clock ("Aug 23 2025 01:30 PM" → 13:30) as well as the
|
||||
* plain 24-hour form ("Aug 23 2025 13:30", "Aug 30 2025").
|
||||
*/
|
||||
function enDateTime(text: string): string | null {
|
||||
const m = text.match(/([A-Za-z]{3,})\.?\s+(\d{1,2}),?\s+(\d{4})(?:[,\s]+(\d{1,2}):(\d{2})\s*([AaPp][Mm])?)?/);
|
||||
if (!m) return null;
|
||||
const mo = EN_MONTHS[m[1].slice(0, 3).toLowerCase()];
|
||||
if (!mo) return null;
|
||||
const date = `${m[3]}-${String(mo).padStart(2, '0')}-${m[2].padStart(2, '0')}`;
|
||||
if (!m[4]) return date;
|
||||
let h = parseInt(m[4], 10);
|
||||
const meridiem = m[6]?.toLowerCase();
|
||||
if (meridiem === 'pm' && h !== 12) h += 12;
|
||||
else if (meridiem === 'am' && h === 12) h = 0;
|
||||
return `${date}T${String(h).padStart(2, '0')}:${m[5]}:00`;
|
||||
}
|
||||
|
||||
/**
 * Symbol/code → ISO 4217, or undefined when none is recognised.
 *
 * Recognition order (first hit wins): `€`, `$`, `£`, `¥`, then a bare three-letter code.
 * Surrounding whitespace is ignored and matching is case-insensitive.
 *
 * The dollar and yen/yuan signs are shared by several currencies, so a well-known
 * country prefix written directly before the sign is honoured (`CA$` → CAD, `HK$` → HKD,
 * `CN¥` → CNY). A bare `$` stays USD and a bare `¥` stays JPY.
 *
 * @param token a currency symbol, a prefixed symbol, or a three-letter code
 * @returns the ISO 4217 code, or undefined when the token is not recognised
 */
export function normCurrency(token: string): string | undefined {
  // Country prefixes that conventionally disambiguate the dollar sign.
  const dollarByPrefix: Readonly<Record<string, string>> = {
    US: 'USD',
    CA: 'CAD',
    A: 'AUD',
    AU: 'AUD',
    NZ: 'NZD',
    HK: 'HKD',
    S: 'SGD',
    SG: 'SGD',
  };

  // Trim first so a padded code (" EUR ") still passes the three-letter check below.
  const u = token.trim().toUpperCase();

  if (u.includes('€')) return 'EUR';
  if (u.includes('$')) {
    // Only letters written directly before the sign count as a prefix.
    const prefix = u.match(/([A-Z]{1,2})\$/)?.[1];
    const iso = prefix !== undefined ? dollarByPrefix[prefix] : undefined;
    return iso ?? 'USD';
  }
  if (u.includes('£')) return 'GBP';
  if (u.includes('¥')) return u.includes('CN¥') ? 'CNY' : 'JPY';

  return /^[A-Z]{3}$/.test(u) ? u : undefined;
}
|
||||
|
||||
/** Same, but defaults to EUR for the EU-centric broker vouchers. */
|
||||
function moneyCurrency(token: string | undefined): string {
|
||||
return normCurrency(token ?? '') ?? 'EUR';
|
||||
}
|
||||
|
||||
/**
 * Example: Sixt rental confirmation. Sixt print-PDFs carry no barcode but a stable
 * "Reservierungsnummer" + Anmietung/Rückgabe block. Conservative: only fires on the Sixt
 * marker, only emits fields it can read unambiguously, and bails to the LLM otherwise.
 *
 * `match` requires both the SIXT word and a "Reservierungsnummer" label. `extract` returns
 * a single `car` reservation, or [] when the reference, either endpoint, or either
 * date could not be read.
 */
const sixt: VendorTemplate = {
  name: 'sixt-rental',
  match: (t) => /\bSIXT\b/i.test(t) && /Reservierungsnummer/i.test(t),
  extract: (t) => {
    const ref = t.match(/Reservierungsnummer:?\s*([A-Z0-9]{6,})/i)?.[1];

    // Read a labelled value and remember where THAT occurrence starts. Re-locating the
    // value by content (indexOf) would land on an earlier mention of the same depot name
    // and start the date scan in the wrong place.
    const labelled = (label: RegExp): { value: string; from: number } | null => {
      const m = t.match(label);
      const raw = m?.[1];
      if (!m || raw === undefined) return null;
      const value = raw.trim();
      if (!value) return null;
      // The capture group is the tail of the match, so its offset is match end − its length.
      return { value, from: (m.index ?? 0) + m[0].length - raw.length };
    };

    const pickup = labelled(/Anmietung:?\s*(.+)/i);
    const dropoff = labelled(/R(?:ü|ue)ckgabe:?\s*(.+)/i);
    const pickupTime = pickup ? deDateTime(t.slice(pickup.from)) : null;
    const dropoffTime = dropoff ? deDateTime(t.slice(dropoff.from)) : null;

    // Need at least a reference and both endpoints with dates to trust the template.
    if (!ref || !pickup || !dropoff || !pickupTime || !dropoffTime) return [];

    // Drop a trailing " - DD.MM.YYYY …" so only the location name remains.
    const place = (s: string) => s.replace(/\s*[-–]\s*\d{2}\.\d{2}\.\d{4}.*$/, '').trim();
    // The pattern only accepts EUR/€, so the emitted currency is always EUR.
    const priceM = t.match(/Gesamtpreis:?\s*([\d.,]+)\s*(EUR|€)/i);

    return [
      {
        type: 'car',
        operator: 'SIXT',
        booking_reference: ref,
        from_name: place(pickup.value),
        to_name: place(dropoff.value),
        departure_time: pickupTime,
        arrival_time: dropoffTime,
        ...(priceM ? { price: priceM[1], currency: 'EUR' } : {}),
      },
    ];
  },
};
|
||||
|
||||
/**
|
||||
* Expedia receipt ("Beleg"). Expedia's German confirmation PDFs carry no barcode but a
|
||||
* stable "Buchungsdetails" block — hotel name, address, Anreise/Abreise — and an
|
||||
* "Expedia-Reiseplan" number + "Gesamtpreis". The text layer reads these cleanly even
|
||||
* when the local model misses the address/price, so pull the hotel deterministically.
|
||||
* (A combined hotel+flight receipt only yields the hotel here — the airline lines carry
|
||||
* no IATA flight number, which the model can't reliably turn into legs either.)
|
||||
*/
|
||||
const expedia: VendorTemplate = {
|
||||
name: 'expedia-hotel',
|
||||
match: (t) => /Expedia-Reiseplan/i.test(t) && /Buchungsdetails/i.test(t) && /Anreise/i.test(t),
|
||||
extract: (t) => {
|
||||
const ref = t.match(/Expedia-Reiseplan:?\s*(\d{6,})/i)?.[1];
|
||||
const block = t.match(/Buchungsdetails\s*\n([\s\S]*?)\nAnreise:/i)?.[1];
|
||||
const checkin = deLongDate(t.match(/Anreise:?\s*([^\n]+)/i)?.[1] ?? '');
|
||||
const checkout = deLongDate(t.match(/Abreise:?\s*([^\n]+)/i)?.[1] ?? '');
|
||||
if (!block || !checkin || !checkout) return [];
|
||||
const lines = block.split('\n').map((s) => s.trim()).filter(Boolean);
|
||||
const name = lines[0];
|
||||
if (!name) return [];
|
||||
const address = lines.slice(1).join(', ') || undefined;
|
||||
const priceM = t.match(/Gesamtpreis\s*([\d.,]+)\s*€/i);
|
||||
return [
|
||||
{
|
||||
type: 'hotel',
|
||||
name,
|
||||
...(ref ? { booking_reference: ref } : {}),
|
||||
...(address ? { address } : {}),
|
||||
checkin_time: checkin,
|
||||
checkout_time: checkout,
|
||||
...(priceM ? { price: priceM[1], currency: 'EUR' } : {}),
|
||||
},
|
||||
];
|
||||
},
|
||||
};
|
||||
|
||||
/**
|
||||
* Broker rental-car voucher (vipcars and the like). These print a stable
|
||||
* "PICK-UP DETAILS / DROP-OFF DETAILS" pair — each followed by the depot name and an
|
||||
* English "Mon DD YYYY HH:MM" line — plus a "Reservation No." and a "Payment Details"
|
||||
* total. The model regularly fails the two-column English date, so read it here.
|
||||
*/
|
||||
const brokerRental: VendorTemplate = {
|
||||
name: 'broker-rental-voucher',
|
||||
match: (t) => /PICK-?UP DETAILS/i.test(t) && /DROP-?OFF DETAILS/i.test(t) && /Reservation\s*No/i.test(t),
|
||||
extract: (t) => {
|
||||
const ref = t.match(/Reservation\s*No\.?:?\s*([A-Z0-9]{5,})/i)?.[1];
|
||||
const block = (label: RegExp) =>
|
||||
t.match(new RegExp(label.source + String.raw`\s*\n([^\n]+)\n([A-Za-z]{3,}\.?\s+\d{1,2},?\s+\d{4}[^\n]*)`, 'i'));
|
||||
const pickup = block(/PICK-?UP DETAILS/);
|
||||
const dropoff = block(/DROP-?OFF DETAILS/);
|
||||
const pickupTime = pickup ? enDateTime(pickup[2]) : null;
|
||||
const dropoffTime = dropoff ? enDateTime(dropoff[2]) : null;
|
||||
if (!ref || !pickup || !dropoff || !pickupTime || !dropoffTime) return [];
|
||||
const company = t
|
||||
.match(/SUPPLIER DETAILS\s*\n([^\n]+?)(?:\s+Supplier Reference|\n|$)/i)?.[1]
|
||||
?.trim()
|
||||
.replace(/\s*\(V\d+\)\s*$/i, ''); // drop the broker's "(V2)" supplier-version tag
|
||||
// Read the first amount in the "Payment Details" block; accept the currency on either
|
||||
// side of the number and derive it (don't assume EUR), so non-EUR vouchers still get a price.
|
||||
const priceM = t.match(
|
||||
/Payment Details[\s\S]{0,120}?(?:(EUR|USD|GBP|CHF|€|\$|£)\s*([\d.,]+)|([\d.,]+)\s*(EUR|USD|GBP|CHF|€|\$|£))/i,
|
||||
);
|
||||
const price = priceM ? priceM[2] ?? priceM[3] : undefined;
|
||||
return [
|
||||
{
|
||||
type: 'car',
|
||||
...(company ? { operator: company } : {}),
|
||||
booking_reference: ref,
|
||||
from_name: pickup[1].trim(),
|
||||
to_name: dropoff[1].trim(),
|
||||
departure_time: pickupTime,
|
||||
arrival_time: dropoffTime,
|
||||
...(price ? { price, currency: moneyCurrency(priceM![1] ?? priceM![4]) } : {}),
|
||||
},
|
||||
];
|
||||
},
|
||||
};
|
||||
|
||||
const TEMPLATES: VendorTemplate[] = [sixt, expedia, brokerRental];
|
||||
|
||||
/**
|
||||
* Try each vendor template; return the first match's result, or null when no template
|
||||
* applies (the router then falls through to the LLM). A template that matches its vendor
|
||||
* but can't parse the layout returns [] and is skipped.
|
||||
*/
|
||||
export function matchVendorTemplate(text: string): FlatReservation[] | null {
|
||||
for (const t of TEMPLATES) {
|
||||
if (!t.match(text)) continue;
|
||||
const result = t.extract(text);
|
||||
if (result.length > 0) return result;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
@@ -0,0 +1,50 @@
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import { extractBookingRef, extractTotalPrice, normCurrency } from '../../../../src/nest/llm-parse/router/extraction-router';
|
||||
|
||||
describe('extractBookingRef', () => {
|
||||
it('reads an Airbnb "Bestätigungs-Code"', () => {
|
||||
expect(extractBookingRef('Bestätigungs-Code\nHMHJ9RTEEK')).toBe('HMHJ9RTEEK');
|
||||
});
|
||||
it('prefers the customer "Reservation No." over a later "Supplier Reference"', () => {
|
||||
expect(extractBookingRef('Reservation No.: G72820729\nSUPPLIER DETAILS\nSupplier Reference: IT587200464')).toBe('G72820729');
|
||||
});
|
||||
it('reads an Expedia "Reiseplan" number', () => {
|
||||
expect(extractBookingRef('Expedia-Reiseplan: 73222406755286')).toBe('73222406755286');
|
||||
});
|
||||
it('reads a classic "Buchungsnummer" / "PNR"', () => {
|
||||
expect(extractBookingRef('Buchungsnummer: ABC123')).toBe('ABC123');
|
||||
expect(extractBookingRef('PNR XY7Q9Z')).toBe('XY7Q9Z');
|
||||
});
|
||||
it('does not capture a prose word after a bare "Confirmation"/"reference"', () => {
|
||||
expect(extractBookingRef('Booking Confirmation\n\nThank you for choosing us')).toBeUndefined();
|
||||
expect(extractBookingRef('For future reference please retain this email')).toBeUndefined();
|
||||
});
|
||||
});
|
||||
|
||||
describe('extractTotalPrice', () => {
|
||||
it('reads a labeled German total', () => {
|
||||
expect(extractTotalPrice('Gesamtpreis 61,23 €')).toEqual({ price: '61,23', currency: 'EUR' });
|
||||
});
|
||||
it('reads an Airbnb "Bezahlter Betrag"', () => {
|
||||
expect(extractTotalPrice('Bezahlter Betrag\n651,86 €')).toEqual({ price: '651,86', currency: 'EUR' });
|
||||
});
|
||||
it('falls back to a standalone ¥ voucher price (JPY) with no nearby label', () => {
|
||||
expect(extractTotalPrice('Price (consumption tax included)\n金額(消費税込)\n¥9,400\nAdult')).toEqual({ price: '9,400', currency: 'JPY' });
|
||||
});
|
||||
it('returns null when there is neither a labeled nor a symbol amount', () => {
|
||||
expect(extractTotalPrice('Just some terms and conditions, no price here.')).toBeNull();
|
||||
});
|
||||
});
|
||||
|
||||
describe('normCurrency', () => {
|
||||
it('maps symbols and codes to ISO 4217', () => {
|
||||
expect(normCurrency('€')).toBe('EUR');
|
||||
expect(normCurrency('¥')).toBe('JPY');
|
||||
expect(normCurrency('$')).toBe('USD');
|
||||
expect(normCurrency('CHF')).toBe('CHF');
|
||||
});
|
||||
it('returns undefined for an unrecognised token', () => {
|
||||
expect(normCurrency('')).toBeUndefined();
|
||||
expect(normCurrency('hello world')).toBeUndefined();
|
||||
});
|
||||
});
|
||||
@@ -1,160 +0,0 @@
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import { matchVendorTemplate } from '../../../../src/nest/llm-parse/router/vendor-templates';
|
||||
import { extractBookingRef, extractTotalPrice } from '../../../../src/nest/llm-parse/router/extraction-router';
|
||||
|
||||
// The snippets below mirror the pdf-parse text layer of real confirmation PDFs
|
||||
// (Expedia hotel receipt, Airbnb booking, a broker rental-car voucher).
|
||||
|
||||
const EXPEDIA_HOTEL = `Beleg
|
||||
Expedia-Reiseplan: 73222406755286
|
||||
Buchungsdatum: 27. Aug. 2025
|
||||
Buchungsdetails
|
||||
Mercure Tokyo Haneda Airport
|
||||
1 Chome-2-11 Haneda, Ota City, Tokyo, 144-0043 Japan
|
||||
Anreise: 3. Mai 2026
|
||||
Abreise: 22. Mai 2026
|
||||
1 Zimmer x 19 Nächte
|
||||
Zahlungsdetails
|
||||
Steuern und Gebühren 1.195,07 €
|
||||
Gesamtpreis 3.516,13 €
|
||||
Bezahlt`;
|
||||
|
||||
const AIRBNB = `Zwei-Zimmer-Wohnung zwischen Venedig und
|
||||
Treviso!
|
||||
Check-in
|
||||
15:00
|
||||
Sa., 23. Aug.
|
||||
Check-out
|
||||
10:00
|
||||
Sa., 30. Aug.
|
||||
Bestätigungs-Code
|
||||
HMHJ9RTEEK
|
||||
Adresse
|
||||
Via Aldo Moro, 47 n. 15, Quarto d'Altino, Venetien 30020, Italien
|
||||
Bezahlter Betrag
|
||||
651,86 €`;
|
||||
|
||||
const BROKER_RENTAL = `Reservation No.: G72820729
|
||||
MAIN DRIVER'S NAME: Felix Pakulat
|
||||
SUPPLIER DETAILS
|
||||
SICILY BY CAR (V2) Supplier Reference: IT587200464
|
||||
PICK-UP DETAILS
|
||||
Venice Marco Polo Airport
|
||||
Aug 23 2025 13:30
|
||||
DROP-OFF DETAILS
|
||||
Venice Marco Polo Airport
|
||||
Aug 30 2025 12:30
|
||||
Payment Details
|
||||
Amount Payable to
|
||||
Supplier:
|
||||
(Payable at Pick-up)
|
||||
EUR 300.21`;
|
||||
|
||||
describe('expedia-hotel vendor template', () => {
|
||||
it('extracts hotel name, address, stay dates, price and Reiseplan number', () => {
|
||||
const out = matchVendorTemplate(EXPEDIA_HOTEL);
|
||||
expect(out).toEqual([
|
||||
{
|
||||
type: 'hotel',
|
||||
name: 'Mercure Tokyo Haneda Airport',
|
||||
booking_reference: '73222406755286',
|
||||
address: '1 Chome-2-11 Haneda, Ota City, Tokyo, 144-0043 Japan',
|
||||
checkin_time: '2026-05-03',
|
||||
checkout_time: '2026-05-22',
|
||||
price: '3.516,13',
|
||||
currency: 'EUR',
|
||||
},
|
||||
]);
|
||||
});
|
||||
|
||||
it('parses German abbreviated months (e.g. "4. Feb. 2026")', () => {
|
||||
const bnb = EXPEDIA_HOTEL.replace('Anreise: 3. Mai 2026', 'Anreise: 4. Feb. 2026').replace(
|
||||
'Abreise: 22. Mai 2026',
|
||||
'Abreise: 6. Feb. 2026',
|
||||
);
|
||||
const out = matchVendorTemplate(bnb);
|
||||
expect(out?.[0]).toMatchObject({ checkin_time: '2026-02-04', checkout_time: '2026-02-06' });
|
||||
});
|
||||
});
|
||||
|
||||
describe('broker-rental-voucher vendor template', () => {
|
||||
it('extracts pickup/return depots, English date-times, price and the customer reservation no.', () => {
|
||||
const out = matchVendorTemplate(BROKER_RENTAL);
|
||||
expect(out).toEqual([
|
||||
{
|
||||
type: 'car',
|
||||
operator: 'SICILY BY CAR', // the "(V2)" supplier-version tag is stripped
|
||||
booking_reference: 'G72820729', // the customer ref, not the supplier reference
|
||||
from_name: 'Venice Marco Polo Airport',
|
||||
to_name: 'Venice Marco Polo Airport',
|
||||
departure_time: '2025-08-23T13:30:00',
|
||||
arrival_time: '2025-08-30T12:30:00',
|
||||
price: '300.21',
|
||||
currency: 'EUR',
|
||||
},
|
||||
]);
|
||||
});
|
||||
});
|
||||
|
||||
describe('non-matching documents', () => {
|
||||
it('returns null when no template applies', () => {
|
||||
expect(matchVendorTemplate(AIRBNB)).toBeNull();
|
||||
expect(matchVendorTemplate('just some unrelated text')).toBeNull();
|
||||
});
|
||||
});
|
||||
|
||||
describe('broker template — date & price variants', () => {
|
||||
const VARIANT = `Reservation No.: AB123456
|
||||
SUPPLIER DETAILS
|
||||
GREEN MOTION Supplier Reference: XYZ
|
||||
PICK-UP DETAILS
|
||||
London Heathrow
|
||||
Aug 5, 2025 09:00 AM
|
||||
DROP-OFF DETAILS
|
||||
London Heathrow
|
||||
Aug 12, 2025 05:30 PM
|
||||
Payment Details
|
||||
Total to pay
|
||||
150.00 GBP`;
|
||||
|
||||
it('handles a comma date, a 12-hour clock and a trailing non-EUR currency', () => {
|
||||
const out = matchVendorTemplate(VARIANT);
|
||||
expect(out?.[0]).toMatchObject({
|
||||
booking_reference: 'AB123456',
|
||||
departure_time: '2025-08-05T09:00:00', // 09:00 AM
|
||||
arrival_time: '2025-08-12T17:30:00', // 05:30 PM → 17:30
|
||||
price: '150.00',
|
||||
currency: 'GBP', // derived, not hard-coded EUR
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe('extractBookingRef', () => {
|
||||
it('reads an Airbnb "Bestätigungs-Code"', () => {
|
||||
expect(extractBookingRef(AIRBNB)).toBe('HMHJ9RTEEK');
|
||||
});
|
||||
it('prefers the customer "Reservation No." over a later "Supplier Reference"', () => {
|
||||
expect(extractBookingRef(BROKER_RENTAL)).toBe('G72820729');
|
||||
});
|
||||
it('still reads a classic "Buchungsnummer" / "PNR"', () => {
|
||||
expect(extractBookingRef('Buchungsnummer: ABC123')).toBe('ABC123');
|
||||
expect(extractBookingRef('PNR XY7Q9Z')).toBe('XY7Q9Z');
|
||||
});
|
||||
it('does not capture a prose word after a bare "Confirmation"/"reference"', () => {
|
||||
expect(extractBookingRef('Booking Confirmation\n\nThank you for choosing us')).toBeUndefined();
|
||||
expect(extractBookingRef('For future reference please retain this email')).toBeUndefined();
|
||||
});
|
||||
});
|
||||
|
||||
describe('extractTotalPrice', () => {
|
||||
it('reads an Airbnb "Bezahlter Betrag"', () => {
|
||||
expect(extractTotalPrice(AIRBNB)).toEqual({ price: '651,86', currency: 'EUR' });
|
||||
});
|
||||
it('falls back to a standalone ¥ voucher price (JPY) with no nearby label', () => {
|
||||
const voucher = 'Price (consumption tax included)\n金額(消費税込)\nPark Admission Date\n¥9,400\nAdult\n1-Day Passport';
|
||||
expect(extractTotalPrice(voucher)).toEqual({ price: '9,400', currency: 'JPY' });
|
||||
});
|
||||
it('returns null when there is neither a labeled nor a symbol amount', () => {
|
||||
expect(extractTotalPrice('Just some terms and conditions, no price here.')).toBeNull();
|
||||
});
|
||||
});
|
||||
Reference in New Issue
Block a user