refactor(extract): drop vendor templates, let the model drive with deterministic backfill

Now that a capable instruct model (Qwen3-8B, thinking off) reads name/address/dates/legs reliably across formats, the per-vendor template short-circuit distorted more than it fixed: brittle on layout variations and overriding the better model output. Remove the template layer; the model extracts the structure and Schicht 2 backfills the confirmation/total and takes the currency from the document's own symbol (correcting model misreads like ¥→$). Per-type prompts now also ask for address and price/currency.
This commit is contained in:
Maurice
2026-06-26 15:42:21 +02:00
parent 51e8524d5c
commit 13f342e446
4 changed files with 93 additions and 430 deletions
@@ -1,25 +1,27 @@
/**
* The extraction router (Schicht 02) — tuned for ONE model call per document.
* The extraction router — tuned for ONE model call per document.
*
* 0. deterministic vendor templates first (no LLM, instant);
* 1. exactly one grammar-ENFORCED call (Ollama native `format`):
* - flights → a flat ARRAY of legs in a single call (a capable model fills every
* leg at once — far faster than one call per leg);
* - otherwise → one flat single-reservation call, on the FAST model when the type is
* obvious from keywords (the common case), else the strong model with a union schema;
* 2. booking-wide fields (PNR, total price) and the overnight-arrival day are filled
* DETERMINISTICALLY from the text — the model isn't asked to repeat or reason about them.
* - otherwise → one flat single-reservation call, with a type-specific schema when the
* type is obvious from keywords (the common case), else a union schema the model picks;
* 2. booking-wide fields (PNR, total price, currency) and the overnight-arrival day are filled
* DETERMINISTICALLY from the text — the model isn't asked to reason about them, and the
* document's own currency symbol corrects the model where it misreads it.
*
* No per-leg fan-out and no repair round-trips: that 48× call count was the latency that made
* a multi-leg flight take minutes on a CPU host. The flat results map into the kitinerary
* pipeline via the existing `nuExtractToKiReservations` mapper, so nothing downstream changes.
* A capable instruct model (e.g. Qwen3-8B with thinking disabled) reads name/address/dates/
* legs reliably across formats, so there's no per-vendor template layer to drift or distort —
* the model handles the long tail and Schicht 2 backstops the money/reference fields. No per-leg
* fan-out and no repair round-trips: that 48× call count was the latency that made a multi-leg
* flight take minutes on a CPU host. The flat results map into the kitinerary pipeline via the
* existing `nuExtractToKiReservations` mapper, so nothing downstream changes.
*/
import type { KiReservation } from '../../booking-import/kitinerary.types';
import { nuExtractToKiReservations } from '../clients/nuextract';
import { FLAT_SCHEMA_BY_TYPE, FLIGHTS_ARRAY_SCHEMA, UNION_SINGLE_SCHEMA, type FlatType } from './flat-schemas';
import { extractEnforced } from './ollama-format.client';
import { matchVendorTemplate, normCurrency } from './vendor-templates';
import type { FlatLike } from './validate';
export interface RouterContext {
@@ -30,16 +32,17 @@ export interface RouterContext {
const TRANSPORT_TYPES: FlatType[] = ['flight', 'train', 'bus', 'ferry'];
/** Per-type guidance for the single-reservation prompt. */
/** Per-type guidance for the single-reservation prompt. `price`/`currency` are the total
* paid and its currency on every type; `address` is the venue street address for stays/venues. */
const TYPE_HINT: Record<FlatType, string> = {
flight: 'flight. vehicle_number = flight number, from_code/to_code = IATA codes, times = full ISO.',
train: 'train. from_name/to_name = stations, vehicle_number = train number, times = full ISO.',
bus: 'bus. from_name/to_name = stops, times = full ISO.',
ferry: 'ferry/cruise. from_name/to_name = terminals/ports, times = full ISO.',
car: 'rental car. from_name = pick-up location, to_name = return location (may differ), departure_time = pick-up, arrival_time = return.',
hotel: 'hotel stay. name = hotel name, checkin_time/checkout_time = full ISO date-time.',
restaurant: 'restaurant booking. name = the restaurant, start_time = the reservation date-time.',
event: 'event/attraction. name = the event, start_time/end_time = full ISO.',
flight: 'flight. vehicle_number = flight number, from_code/to_code = IATA codes, times = full ISO, price/currency = total fare.',
train: 'train. from_name/to_name = stations, vehicle_number = train number, times = full ISO, price/currency = total fare.',
bus: 'bus. from_name/to_name = stops, times = full ISO, price/currency = total fare.',
ferry: 'ferry/cruise. from_name/to_name = terminals/ports, times = full ISO, price/currency = total fare.',
car: 'rental car. from_name = pick-up location, to_name = return location (may differ), departure_time = pick-up, arrival_time = return, price/currency = total rental cost.',
hotel: 'hotel stay. name = hotel name, address = the hotel street address, checkin_time/checkout_time = full ISO date-time, price/currency = total paid.',
restaurant: 'restaurant booking. name = the restaurant, address = its street address, start_time = the reservation date-time, price/currency = total if shown.',
event: 'event/attraction. name = the event/ticket, address = the venue, start_time/end_time = full ISO, price/currency = ticket price.',
};
/** Keyword → reservation type, so an obvious document skips the costlier union/strong path. */
@@ -79,11 +82,21 @@ export function extractBookingRef(text: string): string | undefined {
// do, while the case-insensitive [A-Z0-9] class would otherwise grab a following prose
// word ("Confirmation\nThank you…" → "Thank") after a bare label.
const m = text.match(
/(?:PNR|Buchungs(?:code|nummer|referenz)|Booking\s*(?:reference|code|number)|Confirmation\s*(?:number|code)?|Reservierungsnummer|Reservation\s*(?:No\.?|Number|Nr\.?)|Best(?:ä|ae)tigungs[-\s]?(?:nummer|code)|Reference)\s*:?\s*((?=[A-Z0-9]*\d)[A-Z0-9]{5,})/i,
/(?:PNR|Buchungs(?:code|nummer|referenz)|Booking\s*(?:reference|code|number)|Confirmation\s*(?:number|code)?|Reservierungsnummer|Reservation\s*(?:No\.?|Number|Nr\.?)|Best(?:ä|ae)tigungs[-\s]?(?:nummer|code)|(?:Expedia[-\s]*)?Reiseplan|Reference)\s*:?\s*((?=[A-Z0-9]*\d)[A-Z0-9]{5,})/i,
);
return m?.[1];
}
/**
 * Normalise a currency symbol or code to an ISO 4217 code.
 *
 * The token is upper-cased, then checked for the symbols €, $, £ and ¥ (in that order);
 * the first symbol it contains decides the result. Without a symbol, a token that is
 * exactly three letters A–Z is returned upper-cased as-is — it is not validated against
 * the ISO 4217 list.
 *
 * @param token - Currency symbol or code as it appears in the document.
 * @returns The ISO 4217 code, or `undefined` when nothing is recognised.
 */
export function normCurrency(token: string): string | undefined {
  const upper = token.toUpperCase();
  const symbolToIso: ReadonlyArray<readonly [string, string]> = [
    ['€', 'EUR'],
    ['$', 'USD'],
    ['£', 'GBP'],
    ['¥', 'JPY'],
  ];
  for (const [symbol, iso] of symbolToIso) {
    if (upper.includes(symbol)) return iso;
  }
  return /^[A-Z]{3}$/.test(upper) ? upper : undefined;
}
/** The booking total, pulled deterministically (raw amount string + ISO currency). */
export function extractTotalPrice(text: string): { price: string; currency?: string } | null {
const strip = (s: string) => s.replace(/[€$£¥\s]/g, '');
@@ -174,11 +187,10 @@ async function extractSingle(text: string, ctx: RouterContext): Promise<FlatLike
}
/**
* Schicht 2 — fill the booking-wide fields the per-reservation extraction doesn't carry:
* the confirmation/PNR and the booking total. Applied to BOTH the deterministic vendor
* results AND the model output, so a vendor template that read the structured fields but
* whose narrow ref/price regex missed still gets the broad doc-wide deterministic value.
* Never overrides a value the source already provided.
* Schicht 2 — fill the booking-wide fields the per-reservation model call doesn't reliably
* carry: the confirmation/PNR and the booking total + its currency. The confirmation and a
* missing price are filled from the document; the currency is taken from the document's own
* symbol/code (authoritative — small models misread it), correcting the model where needed.
*/
function fillBookingWideFields(flats: Record<string, unknown>[], text: string): void {
const ref = extractBookingRef(text);
@@ -188,10 +200,12 @@ function fillBookingWideFields(flats: Record<string, unknown>[], text: string):
const priceMissing = (v: unknown) => v == null || (typeof v === 'string' && v.trim() === '');
flats.forEach((f, i) => {
if (!f.booking_reference && ref) f.booking_reference = ref;
// The total belongs to the booking, so attach it once (the first item).
if (i === 0 && total && priceMissing(f.price)) {
f.price = total.price;
if (f.currency == null) f.currency = total.currency;
// The total belongs to the booking, so handle it once (the first item).
if (i === 0 && total) {
if (priceMissing(f.price)) f.price = total.price;
// The document's own currency symbol/code is authoritative; let it override the
// model's guess (small models misread "¥" as "$").
if (total.currency) f.currency = total.currency;
}
});
}
@@ -203,15 +217,6 @@ function fillBookingWideFields(flats: Record<string, unknown>[], text: string):
export async function routeExtraction(text: string, ctx: RouterContext): Promise<{ kiItems: KiReservation[]; warnings: string[] }> {
const warnings: string[] = [];
// Schicht 0 — deterministic vendor templates (no LLM). Still top up the booking-wide
// fields, so that a template miss on the ref/price doesn't drop them when the doc-wide
// deterministic extractor would have found them.
const vendor = matchVendorTemplate(text);
if (vendor && vendor.length > 0) {
fillBookingWideFields(vendor, text);
return { kiItems: nuExtractToKiReservations(vendor) as unknown as KiReservation[], warnings };
}
// Schicht 1 — exactly one model call.
let flats: FlatLike[];
try {
@@ -1,232 +0,0 @@
/**
* Schicht 0 — deterministic vendor templates.
*
* KItinerary already handles documents with machine-readable data (boarding-pass
* barcodes, UIC rail codes, embedded schema.org JSON-LD) upstream of the LLM. This
* layer extends the deterministic net to a handful of high-volume vendors whose plain
* PDFs carry NO barcode but a stable text layout (Booking.com, Expedia, Airbnb, the big
* airlines, Sixt/Europcar…). A matched template returns a fully-formed result with ZERO
* model inference — instant, free, and 100% repeatable — so the common case never loads
* the CPU. The LLM router only runs for the long tail.
*
* Templates emit the same flat field shape the router uses, so they feed the identical
* `nuExtractToKiReservations` mapper. Each template must be CONSERVATIVE: fire only on an
* unambiguous marker and only emit fields it can read with certainty — a wrong
* deterministic answer is worse than deferring to the model. This file is the seam where
* new vendor extractors are added; it ships with one worked example.
*/
import type { FlatType } from './flat-schemas';
/**
* Flat reservation record emitted by a vendor template. Only `type` is required; a
* template sets just the fields it could read from the document.
*/
export interface FlatReservation {
/** Reservation kind (a `FlatType` from `./flat-schemas`). */
type: FlatType;
/** Booking/confirmation number as printed in the document. */
booking_reference?: string;
/** Operating company (the Sixt template sets 'SIXT', the broker template the supplier name). */
operator?: string;
/** Name of the reservation (the Expedia template sets the hotel name). */
name?: string;
/** Start location (the pick-up location in the rental templates). */
from_name?: string;
/** End location (the return/drop-off location in the rental templates). */
to_name?: string;
/** Start date-time as an ISO string (the pick-up time in the rental templates). */
departure_time?: string;
/** End date-time as an ISO string (the return time in the rental templates). */
arrival_time?: string;
/** Venue address (the Expedia template joins the address lines with ", "). */
address?: string;
/** Check-in as an ISO string (the Expedia template emits a date without a time). */
checkin_time?: string;
/** Check-out as an ISO string (the Expedia template emits a date without a time). */
checkout_time?: string;
/** Amount as matched in the document text — a string, not parsed to a number. */
price?: string;
/** ISO 4217 currency code belonging to `price`. */
currency?: string;
/** Any further keys are allowed and left untyped. */
[k: string]: unknown;
}
/** A deterministic extractor for one vendor's document layout. */
interface VendorTemplate {
/** Identifier of the template (e.g. 'sixt-rental'). */
name: string;
/** Cheap check: is this that vendor's document at all? */
match(text: string): boolean;
/** Pull the reservation(s); return [] if the layout didn't parse as expected. */
extract(text: string): FlatReservation[];
}
/**
 * Find the first German/EU numeric date in `text` and convert it to ISO.
 *
 * Accepts "DD.MM.YYYY" with an optional time separated by commas/whitespace
 * ("24.12.2026, 10:00" or "24.12.2026 10:00 Uhr"). Day and month must be two digits;
 * the hour may be one or two digits and is zero-padded.
 *
 * @param text - Text to search.
 * @returns "YYYY-MM-DD" or "YYYY-MM-DDTHH:MM:00", or `null` when no date is found.
 */
function deDateTime(text: string): string | null {
  const hit = /(\d{2})\.(\d{2})\.(\d{4})(?:[,\s]+(\d{1,2}):(\d{2}))?/.exec(text);
  if (hit === null) return null;

  const isoDate = [hit[3], hit[2], hit[1]].join('-');
  const hour = hit[4];
  // The time group is optional; without it only the date is returned.
  if (!hour) return isoDate;
  return `${isoDate}T${hour.padStart(2, '0')}:${hit[5]}:00`;
}
/**
 * German month name/abbreviation → month number (matched on the first three letters).
 * `mae` covers the transliterated spelling "Maerz", `mrz` the abbreviation "Mrz".
 */
const DE_MONTHS: Record<string, number> = {
  jan: 1, feb: 2, 'mär': 3, mae: 3, mrz: 3, apr: 4, mai: 5, jun: 6, jul: 7, aug: 8, sep: 9, okt: 10, nov: 11, dez: 12,
};
/** English month name/abbreviation → month number (matched on the first three letters). */
const EN_MONTHS: Record<string, number> = {
  jan: 1, feb: 2, mar: 3, apr: 4, may: 5, jun: 6, jul: 7, aug: 8, sep: 9, oct: 10, nov: 11, dec: 12,
};
/**
 * Parse a German long-form date ("3. Mai 2026", "27. Aug. 2025", "4. Maerz 2026") to an
 * ISO date — no time.
 *
 * @param text - Text to search; the first "D. Month YYYY" occurrence is used.
 * @returns "YYYY-MM-DD", or `null` when nothing matches or the month word is unknown.
 */
function deLongDate(text: string): string | null {
  const m = text.match(/(\d{1,2})\.\s*([A-Za-zäöüÄÖÜ]+)\.?\s+(\d{4})/);
  if (!m) return null;
  const [, day = '', monthName = '', year = ''] = m;
  const mo = DE_MONTHS[monthName.slice(0, 3).toLowerCase()];
  if (!mo) return null;
  return `${year}-${String(mo).padStart(2, '0')}-${day.padStart(2, '0')}`;
}
/**
 * Parse an English date + optional time to ISO. Tolerates a comma after the day
 * ("Aug 5, 2025") and a 12-hour clock ("Aug 23 2025 01:30 PM" → 13:30) as well as the
 * plain 24-hour form ("Aug 23 2025 13:30", "Aug 30 2025").
 *
 * @param text - Text to search; the first "Month D YYYY" occurrence is used.
 * @returns "YYYY-MM-DD" or "YYYY-MM-DDTHH:MM:00"; `null` when nothing matches, the month
 *   word is unknown, or the time is out of range (hour > 23 or minute > 59).
 */
function enDateTime(text: string): string | null {
  const m = text.match(/([A-Za-z]{3,})\.?\s+(\d{1,2}),?\s+(\d{4})(?:[,\s]+(\d{1,2}):(\d{2})\s*([AaPp][Mm])?)?/);
  if (!m) return null;
  const [, monthName = '', day = '', year = '', hour, minute = '00', meridiemRaw] = m;
  const mo = EN_MONTHS[monthName.slice(0, 3).toLowerCase()];
  if (!mo) return null;
  const date = `${year}-${String(mo).padStart(2, '0')}-${day.padStart(2, '0')}`;
  if (!hour) return date;
  let h = parseInt(hour, 10);
  const meridiem = meridiemRaw?.toLowerCase();
  // Shift only genuine 12-hour values: "13:30 PM" is already 24-hour and must not become 25:30.
  if (meridiem === 'pm' && h < 12) h += 12;
  else if (meridiem === 'am' && h === 12) h = 0;
  // An impossible clock time is not a certain read — return null rather than emit invalid ISO.
  if (h > 23 || parseInt(minute, 10) > 59) return null;
  return `${date}T${String(h).padStart(2, '0')}:${minute}:00`;
}
/**
 * Map a currency symbol or code to ISO 4217.
 *
 * The upper-cased token is scanned for €, $, £ and ¥ in that order. If none is present,
 * an exact three-letter A–Z token is passed through upper-cased (without checking that
 * it is a real ISO code).
 *
 * @param token - Symbol or code taken from the document.
 * @returns The ISO 4217 code, or `undefined` when none is recognised.
 */
export function normCurrency(token: string): string | undefined {
  const normalized = token.toUpperCase();
  const symbolHit = [
    { symbol: '€', iso: 'EUR' },
    { symbol: '$', iso: 'USD' },
    { symbol: '£', iso: 'GBP' },
    { symbol: '¥', iso: 'JPY' },
  ].find(({ symbol }) => normalized.includes(symbol));
  if (symbolHit !== undefined) return symbolHit.iso;
  if (/^[A-Z]{3}$/.test(normalized)) return normalized;
  return undefined;
}
/**
 * Like `normCurrency`, but falls back to EUR (the EU-centric broker vouchers) when the
 * token is missing or not recognised.
 *
 * @param token - Currency symbol/code, or `undefined` when none was captured.
 * @returns An ISO 4217 code; 'EUR' by default.
 */
function moneyCurrency(token: string | undefined): string {
  const iso = normCurrency(token === undefined ? '' : token);
  return iso === undefined ? 'EUR' : iso;
}
/**
 * Example: Sixt rental confirmation. Sixt print-PDFs carry no barcode but a stable
 * "Reservierungsnummer" + Anmietung/Rückgabe block. Conservative: only fires on the Sixt
 * marker, only emits fields it can read unambiguously, and bails to the LLM otherwise.
 *
 * - `match` requires the word "SIXT" and a "Reservierungsnummer" label.
 * - `extract` returns a single `car` reservation, or `[]` when the reference, either
 *   endpoint or either date is missing. The total is attached only when a
 *   "Gesamtpreis" in EUR/€ is found.
 */
const sixt: VendorTemplate = {
  name: 'sixt-rental',
  match: (t) => /\bSIXT\b/i.test(t) && /Reservierungsnummer/i.test(t),
  extract: (t) => {
    // Require a digit in the reference: with the /i flag the bare character class would
    // otherwise accept a prose word that follows the label ("Reservierungsnummer\nVielen Dank").
    const ref = t.match(/Reservierungsnummer:?\s*((?=[A-Z0-9]*\d)[A-Z0-9]{6,})/i)?.[1];
    const pickupM = /Anmietung:?\s*(.+)/i.exec(t);
    const dropoffM = /R(?:ü|ue)ckgabe:?\s*(.+)/i.exec(t);
    const pickup = pickupM?.[1]?.trim();
    const dropoff = dropoffM?.[1]?.trim();
    // Search for each date from the position of its own label. Looking the captured text
    // up again with indexOf() would land on an earlier occurrence of the same station
    // text and read the wrong date.
    const pickupTime = pickupM && pickup ? deDateTime(t.slice(pickupM.index)) : null;
    const dropoffTime = dropoffM && dropoff ? deDateTime(t.slice(dropoffM.index)) : null;
    // Need at least a reference and both endpoints with dates to trust the template.
    if (!ref || !pickup || !dropoff || !pickupTime || !dropoffTime) return [];
    // Drop a trailing " - DD.MM.YYYY …" so only the location name remains.
    const place = (s: string) => s.replace(/\s*[-]\s*\d{2}\.\d{2}\.\d{4}.*$/, '').trim();
    const priceM = t.match(/Gesamtpreis:?\s*([\d.,]+)\s*(EUR|€)/i);
    return [
      {
        type: 'car',
        operator: 'SIXT',
        booking_reference: ref,
        from_name: place(pickup),
        to_name: place(dropoff),
        departure_time: pickupTime,
        arrival_time: dropoffTime,
        ...(priceM ? { price: priceM[1], currency: 'EUR' } : {}),
      },
    ];
  },
};
/**
 * Expedia receipt ("Beleg"). Expedia's German confirmation PDFs have no barcode, but they
 * do have a stable "Buchungsdetails" block (hotel name, address lines, then
 * Anreise/Abreise), an "Expedia-Reiseplan" number and a "Gesamtpreis". The hotel is read
 * deterministically from that text layer.
 * (A combined hotel+flight receipt only yields the hotel here — the airline lines carry
 * no IATA flight number, which the model can't reliably turn into legs either.)
 *
 * `extract` returns one `hotel` reservation, or `[]` when the details block, its first
 * line (the name) or either stay date cannot be read. Reference, address and price are
 * optional extras.
 */
const expedia: VendorTemplate = {
  name: 'expedia-hotel',
  match: (t) => {
    const markers = [/Expedia-Reiseplan/i, /Buchungsdetails/i, /Anreise/i];
    return markers.every((marker) => marker.test(t));
  },
  extract: (t) => {
    const ref = /Expedia-Reiseplan:?\s*(\d{6,})/i.exec(t)?.[1];
    // Everything between the "Buchungsdetails" heading and the "Anreise:" line.
    const block = /Buchungsdetails\s*\n([\s\S]*?)\nAnreise:/i.exec(t)?.[1];
    const arrivalLine = /Anreise:?\s*([^\n]+)/i.exec(t)?.[1] ?? '';
    const departureLine = /Abreise:?\s*([^\n]+)/i.exec(t)?.[1] ?? '';
    const checkin = deLongDate(arrivalLine);
    const checkout = deLongDate(departureLine);
    if (!block || !checkin || !checkout) return [];

    // First non-empty line is the hotel name; the remaining lines form the address.
    const [name, ...addressLines] = block
      .split('\n')
      .map((s) => s.trim())
      .filter(Boolean);
    if (!name) return [];
    const address = addressLines.join(', ') || undefined;
    const priceM = /Gesamtpreis\s*([\d.,]+)\s*€/i.exec(t);

    const hotel: FlatReservation = { type: 'hotel', name };
    if (ref) hotel.booking_reference = ref;
    if (address) hotel.address = address;
    hotel.checkin_time = checkin;
    hotel.checkout_time = checkout;
    if (priceM) {
      hotel.price = priceM[1];
      hotel.currency = 'EUR';
    }
    return [hotel];
  },
};
/**
 * Broker rental-car voucher (vipcars and the like). These print a stable
 * "PICK-UP DETAILS / DROP-OFF DETAILS" pair — each followed by the depot name and an
 * English "Mon DD YYYY HH:MM" line — plus a "Reservation No." and a "Payment Details"
 * total. The model regularly fails the two-column English date, so read it here.
 *
 * `extract` returns one `car` reservation, or `[]` when the reservation number, either
 * depot block or either date-time cannot be read. Operator and price are optional extras.
 */
const brokerRental: VendorTemplate = {
  name: 'broker-rental-voucher',
  match: (t) => /PICK-?UP DETAILS/i.test(t) && /DROP-?OFF DETAILS/i.test(t) && /Reservation\s*No/i.test(t),
  extract: (t) => {
    // Require a digit in the reference: the case-insensitive class would otherwise read
    // prose as a reference ("Reservation Notification" → "tification").
    const ref = t.match(/Reservation\s*No\.?:?\s*((?=[A-Z0-9]*\d)[A-Z0-9]{5,})/i)?.[1];
    // Heading, then the depot line, then the English date line directly below it.
    const block = (label: RegExp) =>
      t.match(new RegExp(label.source + String.raw`\s*\n([^\n]+)\n([A-Za-z]{3,}\.?\s+\d{1,2},?\s+\d{4}[^\n]*)`, 'i'));
    const pickup = block(/PICK-?UP DETAILS/);
    const dropoff = block(/DROP-?OFF DETAILS/);
    const pickupTime = pickup ? enDateTime(pickup[2] ?? '') : null;
    const dropoffTime = dropoff ? enDateTime(dropoff[2] ?? '') : null;
    if (!ref || !pickup || !dropoff || !pickupTime || !dropoffTime) return [];
    const company = t
      .match(/SUPPLIER DETAILS\s*\n([^\n]+?)(?:\s+Supplier Reference|\n|$)/i)?.[1]
      ?.trim()
      .replace(/\s*\(V\d+\)\s*$/i, ''); // drop the broker's "(V2)" supplier-version tag
    // Read the first amount in the "Payment Details" block; accept the currency on either
    // side of the number and derive it (don't assume EUR), so non-EUR vouchers still get a price.
    // The amount must start and end with a digit, so stray punctuation next to the
    // currency ("Pick-up. EUR 300.21", "EUR 300.21.") is never captured as (part of) the price.
    const priceM = t.match(
      /Payment Details[\s\S]{0,120}?(?:(EUR|USD|GBP|CHF|€|\$|£)\s*(\d+(?:[.,]\d+)*)|(\d+(?:[.,]\d+)*)\s*(EUR|USD|GBP|CHF|€|\$|£))/i,
    );
    // Groups 1/2 belong to the "currency first" form, groups 3/4 to "amount first".
    const price = priceM?.[2] ?? priceM?.[3];
    const currencyToken = priceM?.[1] ?? priceM?.[4];
    return [
      {
        type: 'car',
        ...(company ? { operator: company } : {}),
        booking_reference: ref,
        from_name: (pickup[1] ?? '').trim(),
        to_name: (dropoff[1] ?? '').trim(),
        departure_time: pickupTime,
        arrival_time: dropoffTime,
        ...(price ? { price, currency: moneyCurrency(currencyToken) } : {}),
      },
    ];
  },
};
/** Registered vendor templates, in the order `matchVendorTemplate` tries them. */
const TEMPLATES: VendorTemplate[] = [sixt, expedia, brokerRental];
/**
 * Run the vendor templates in registration order and hand back the first non-empty
 * extraction.
 *
 * A template whose `match` fails is skipped without extracting; one that matches its
 * vendor but extracts nothing (`[]`) is skipped as well, and the next template is tried.
 *
 * @param text - Plain text of the document.
 * @returns The reservations of the first successful template, or `null` when no template
 *   applies (the router then falls through to the LLM).
 */
export function matchVendorTemplate(text: string): FlatReservation[] | null {
  for (const template of TEMPLATES) {
    if (template.match(text)) {
      const reservations = template.extract(text);
      if (reservations.length !== 0) {
        return reservations;
      }
    }
  }
  return null;
}
@@ -0,0 +1,50 @@
import { describe, it, expect } from 'vitest';
import { extractBookingRef, extractTotalPrice, normCurrency } from '../../../../src/nest/llm-parse/router/extraction-router';
// extractBookingRef: labelled references from several vendors' wording, plus the guard
// against capturing a prose word after a bare label.
describe('extractBookingRef', () => {
it('reads an Airbnb "Bestätigungs-Code"', () => {
expect(extractBookingRef('Bestätigungs-Code\nHMHJ9RTEEK')).toBe('HMHJ9RTEEK');
});
it('prefers the customer "Reservation No." over a later "Supplier Reference"', () => {
expect(extractBookingRef('Reservation No.: G72820729\nSUPPLIER DETAILS\nSupplier Reference: IT587200464')).toBe('G72820729');
});
it('reads an Expedia "Reiseplan" number', () => {
expect(extractBookingRef('Expedia-Reiseplan: 73222406755286')).toBe('73222406755286');
});
it('reads a classic "Buchungsnummer" / "PNR"', () => {
expect(extractBookingRef('Buchungsnummer: ABC123')).toBe('ABC123');
expect(extractBookingRef('PNR XY7Q9Z')).toBe('XY7Q9Z');
});
it('does not capture a prose word after a bare "Confirmation"/"reference"', () => {
expect(extractBookingRef('Booking Confirmation\n\nThank you for choosing us')).toBeUndefined();
expect(extractBookingRef('For future reference please retain this email')).toBeUndefined();
});
});
// extractTotalPrice: labelled totals, the unlabelled currency-symbol fallback, and the
// null result when the text carries no amount. The price stays a raw string.
describe('extractTotalPrice', () => {
it('reads a labeled German total', () => {
expect(extractTotalPrice('Gesamtpreis 61,23 €')).toEqual({ price: '61,23', currency: 'EUR' });
});
it('reads an Airbnb "Bezahlter Betrag"', () => {
expect(extractTotalPrice('Bezahlter Betrag\n651,86 €')).toEqual({ price: '651,86', currency: 'EUR' });
});
it('falls back to a standalone ¥ voucher price (JPY) with no nearby label', () => {
expect(extractTotalPrice('Price (consumption tax included)\n金額(消費税込)\n¥9,400\nAdult')).toEqual({ price: '9,400', currency: 'JPY' });
});
it('returns null when there is neither a labeled nor a symbol amount', () => {
expect(extractTotalPrice('Just some terms and conditions, no price here.')).toBeNull();
});
});
// normCurrency: symbols and three-letter codes map to ISO 4217; anything else is undefined.
describe('normCurrency', () => {
it('maps symbols and codes to ISO 4217', () => {
expect(normCurrency('€')).toBe('EUR');
expect(normCurrency('¥')).toBe('JPY');
expect(normCurrency('$')).toBe('USD');
expect(normCurrency('CHF')).toBe('CHF');
});
it('returns undefined for an unrecognised token', () => {
expect(normCurrency('')).toBeUndefined();
expect(normCurrency('hello world')).toBeUndefined();
});
});
@@ -1,160 +0,0 @@
import { describe, it, expect } from 'vitest';
import { matchVendorTemplate } from '../../../../src/nest/llm-parse/router/vendor-templates';
import { extractBookingRef, extractTotalPrice } from '../../../../src/nest/llm-parse/router/extraction-router';
// The snippets below mirror the pdf-parse text layer of real confirmation PDFs
// (Expedia hotel receipt, Airbnb booking, a broker rental-car voucher).
// Expedia hotel receipt: Reiseplan number, "Buchungsdetails" block, stay dates, totals.
const EXPEDIA_HOTEL = `Beleg
Expedia-Reiseplan: 73222406755286
Buchungsdatum: 27. Aug. 2025
Buchungsdetails
Mercure Tokyo Haneda Airport
1 Chome-2-11 Haneda, Ota City, Tokyo, 144-0043 Japan
Anreise: 3. Mai 2026
Abreise: 22. Mai 2026
1 Zimmer x 19 Nächte
Zahlungsdetails
Steuern und Gebühren 1.195,07 €
Gesamtpreis 3.516,13 €
Bezahlt`;
// Airbnb booking: no vendor template applies; used for the doc-wide ref/price extractors.
const AIRBNB = `Zwei-Zimmer-Wohnung zwischen Venedig und
Treviso!
Check-in
15:00
Sa., 23. Aug.
Check-out
10:00
Sa., 30. Aug.
Bestätigungs-Code
HMHJ9RTEEK
Adresse
Via Aldo Moro, 47 n. 15, Quarto d'Altino, Venetien 30020, Italien
Bezahlter Betrag
651,86 €`;
// Broker rental-car voucher: customer reservation no., supplier line, depot/date blocks, payment.
const BROKER_RENTAL = `Reservation No.: G72820729
MAIN DRIVER'S NAME: Felix Pakulat
SUPPLIER DETAILS
SICILY BY CAR (V2) Supplier Reference: IT587200464
PICK-UP DETAILS
Venice Marco Polo Airport
Aug 23 2025 13:30
DROP-OFF DETAILS
Venice Marco Polo Airport
Aug 30 2025 12:30
Payment Details
Amount Payable to
Supplier:
(Payable at Pick-up)
EUR 300.21`;
// Expedia template: full field set from the receipt fixture, and abbreviated German months.
describe('expedia-hotel vendor template', () => {
it('extracts hotel name, address, stay dates, price and Reiseplan number', () => {
const out = matchVendorTemplate(EXPEDIA_HOTEL);
expect(out).toEqual([
{
type: 'hotel',
name: 'Mercure Tokyo Haneda Airport',
booking_reference: '73222406755286',
address: '1 Chome-2-11 Haneda, Ota City, Tokyo, 144-0043 Japan',
checkin_time: '2026-05-03',
checkout_time: '2026-05-22',
price: '3.516,13',
currency: 'EUR',
},
]);
});
it('parses German abbreviated months (e.g. "4. Feb. 2026")', () => {
const bnb = EXPEDIA_HOTEL.replace('Anreise: 3. Mai 2026', 'Anreise: 4. Feb. 2026').replace(
'Abreise: 22. Mai 2026',
'Abreise: 6. Feb. 2026',
);
const out = matchVendorTemplate(bnb);
expect(out?.[0]).toMatchObject({ checkin_time: '2026-02-04', checkout_time: '2026-02-06' });
});
});
// Broker template: the complete reservation expected from the voucher fixture.
describe('broker-rental-voucher vendor template', () => {
it('extracts pickup/return depots, English date-times, price and the customer reservation no.', () => {
const out = matchVendorTemplate(BROKER_RENTAL);
expect(out).toEqual([
{
type: 'car',
operator: 'SICILY BY CAR', // the "(V2)" supplier-version tag is stripped
booking_reference: 'G72820729', // the customer ref, not the supplier reference
from_name: 'Venice Marco Polo Airport',
to_name: 'Venice Marco Polo Airport',
departure_time: '2025-08-23T13:30:00',
arrival_time: '2025-08-30T12:30:00',
price: '300.21',
currency: 'EUR',
},
]);
});
});
// Documents without a matching template must yield null so the router falls through to the model.
describe('non-matching documents', () => {
it('returns null when no template applies', () => {
expect(matchVendorTemplate(AIRBNB)).toBeNull();
expect(matchVendorTemplate('just some unrelated text')).toBeNull();
});
});
// Broker template variants: comma after the day, 12-hour clock, and a currency code
// that follows the amount.
describe('broker template — date & price variants', () => {
const VARIANT = `Reservation No.: AB123456
SUPPLIER DETAILS
GREEN MOTION Supplier Reference: XYZ
PICK-UP DETAILS
London Heathrow
Aug 5, 2025 09:00 AM
DROP-OFF DETAILS
London Heathrow
Aug 12, 2025 05:30 PM
Payment Details
Total to pay
150.00 GBP`;
it('handles a comma date, a 12-hour clock and a trailing non-EUR currency', () => {
const out = matchVendorTemplate(VARIANT);
expect(out?.[0]).toMatchObject({
booking_reference: 'AB123456',
departure_time: '2025-08-05T09:00:00', // 09:00 AM
arrival_time: '2025-08-12T17:30:00', // 05:30 PM → 17:30
price: '150.00',
currency: 'GBP', // derived, not hard-coded EUR
});
});
});
// extractBookingRef against the full-document fixtures, plus the prose-word guard.
describe('extractBookingRef', () => {
it('reads an Airbnb "Bestätigungs-Code"', () => {
expect(extractBookingRef(AIRBNB)).toBe('HMHJ9RTEEK');
});
it('prefers the customer "Reservation No." over a later "Supplier Reference"', () => {
expect(extractBookingRef(BROKER_RENTAL)).toBe('G72820729');
});
it('still reads a classic "Buchungsnummer" / "PNR"', () => {
expect(extractBookingRef('Buchungsnummer: ABC123')).toBe('ABC123');
expect(extractBookingRef('PNR XY7Q9Z')).toBe('XY7Q9Z');
});
it('does not capture a prose word after a bare "Confirmation"/"reference"', () => {
expect(extractBookingRef('Booking Confirmation\n\nThank you for choosing us')).toBeUndefined();
expect(extractBookingRef('For future reference please retain this email')).toBeUndefined();
});
});
// extractTotalPrice against the Airbnb fixture, the unlabelled ¥ fallback and the no-price case.
describe('extractTotalPrice', () => {
it('reads an Airbnb "Bezahlter Betrag"', () => {
expect(extractTotalPrice(AIRBNB)).toEqual({ price: '651,86', currency: 'EUR' });
});
it('falls back to a standalone ¥ voucher price (JPY) with no nearby label', () => {
const voucher = 'Price (consumption tax included)\n金額(消費税込)\nPark Admission Date\n¥9,400\nAdult\n1-Day Passport';
expect(extractTotalPrice(voucher)).toEqual({ price: '9,400', currency: 'JPY' });
});
it('returns null when there is neither a labeled nor a symbol amount', () => {
expect(extractTotalPrice('Just some terms and conditions, no price here.')).toBeNull();
});
});