From c7f5694f635f2182785c821f8801f0bb3d2940c5 Mon Sep 17 00:00:00 2001
From: Maurice
Date: Fri, 26 Jun 2026 09:08:25 +0200
Subject: [PATCH] feat(extract): add Expedia and rental-broker booking templates

Pull the hotel/rental fields these vendors print in a stable text layout (name,
address, stay/pickup dates, price, reference) deterministically, so the import
stops depending on the local model for them. Handles German long/abbreviated
months and English dates incl. 12-hour and comma forms.
---
 .../nest/llm-parse/router/vendor-templates.ts | 130 +++++++++++++++++-
 1 file changed, 128 insertions(+), 2 deletions(-)

diff --git a/server/src/nest/llm-parse/router/vendor-templates.ts b/server/src/nest/llm-parse/router/vendor-templates.ts
index bd2ea973..4178bad6 100644
--- a/server/src/nest/llm-parse/router/vendor-templates.ts
+++ b/server/src/nest/llm-parse/router/vendor-templates.ts
@@ -43,7 +43,7 @@ interface VendorTemplate {
   extract(text: string): FlatReservation[];
 }
 
-/** Parse a German/EU date + time ("24.12.2026, 10:00" / "24.12.2026 10:00 Uhr") to ISO. */
+/** Parse a German/EU numeric date + time ("24.12.2026, 10:00" / "24.12.2026 10:00 Uhr") to ISO. */
 function deDateTime(text: string): string | null {
   const m = text.match(/(\d{2})\.(\d{2})\.(\d{4})(?:[,\s]+(\d{1,2}):(\d{2}))?/);
   if (!m) return null;
@@ -51,6 +51,53 @@ function deDateTime(text: string): string | null {
   return `${y}-${mo}-${d}` + (h ? `T${h.padStart(2, '0')}:${mi}:00` : '');
 }
 
+/** German month name/abbreviation → month number (matched on the first three letters). */
+const DE_MONTHS: Record<string, number> = {
+  jan: 1, feb: 2, 'mär': 3, mrz: 3, apr: 4, mai: 5, jun: 6, jul: 7, aug: 8, sep: 9, okt: 10, nov: 11, dez: 12,
+};
+/** English month name/abbreviation → month number (matched on the first three letters). */
+const EN_MONTHS: Record<string, number> = {
+  jan: 1, feb: 2, mar: 3, apr: 4, may: 5, jun: 6, jul: 7, aug: 8, sep: 9, oct: 10, nov: 11, dec: 12,
+};
+
+/** Parse a German long-form date ("3. Mai 2026", "27. Aug. 2025") to an ISO date — no time. */
+function deLongDate(text: string): string | null {
+  const m = text.match(/(\d{1,2})\.\s*([A-Za-zäöüÄÖÜ]+)\.?\s+(\d{4})/);
+  if (!m) return null;
+  const mo = DE_MONTHS[m[2].slice(0, 3).toLowerCase()];
+  if (!mo) return null;
+  return `${m[3]}-${String(mo).padStart(2, '0')}-${m[1].padStart(2, '0')}`;
+}
+
+/**
+ * Parse an English date + optional time to ISO. Tolerates a comma after the day
+ * ("Aug 5, 2025") and a 12-hour clock ("Aug 23 2025 01:30 PM" → 13:30) as well as the
+ * plain 24-hour form ("Aug 23 2025 13:30", "Aug 30 2025").
+ */
+function enDateTime(text: string): string | null {
+  const m = text.match(/([A-Za-z]{3,})\.?\s+(\d{1,2}),?\s+(\d{4})(?:[,\s]+(\d{1,2}):(\d{2})\s*([AaPp][Mm])?)?/);
+  if (!m) return null;
+  const mo = EN_MONTHS[m[1].slice(0, 3).toLowerCase()];
+  if (!mo) return null;
+  const date = `${m[3]}-${String(mo).padStart(2, '0')}-${m[2].padStart(2, '0')}`;
+  if (!m[4]) return date;
+  let h = parseInt(m[4], 10);
+  const meridiem = m[6]?.toLowerCase();
+  if (meridiem === 'pm' && h < 12) h += 12; // a redundant "13:30 PM" stays 13:30 instead of becoming 25:30
+  else if (meridiem === 'am' && h === 12) h = 0;
+  return `${date}T${String(h).padStart(2, '0')}:${m[5]}:00`;
+}
+
+/** Symbol/code → ISO 4217 (defaults to EUR for the EU-centric broker vouchers). */
+function moneyCurrency(token: string | undefined): string {
+  if (!token) return 'EUR';
+  const u = token.toUpperCase();
+  if (u.includes('€')) return 'EUR';
+  if (u.includes('$')) return 'USD';
+  if (u.includes('£')) return 'GBP';
+  return /^[A-Z]{3}$/.test(u) ? u : 'EUR';
+}
+
 /**
  * Example: Sixt rental confirmation. Sixt print-PDFs carry no barcode but a stable
  * "Reservierungsnummer" + Anmietung/Rückgabe block. Conservative: only fires on the Sixt
@@ -84,7 +131,86 @@ const sixt: VendorTemplate = {
   },
 };
 
-const TEMPLATES: VendorTemplate[] = [sixt];
+/**
+ * Expedia receipt ("Beleg"). Expedia's German confirmation PDFs carry no barcode but a
+ * stable "Buchungsdetails" block — hotel name, address, Anreise/Abreise — and an
+ * "Expedia-Reiseplan" number + "Gesamtpreis". The text layer reads these cleanly even
+ * when the local model misses the address/price, so pull the hotel deterministically.
+ * (A combined hotel+flight receipt only yields the hotel here — the airline lines carry
+ * no IATA flight number, which the model can't reliably turn into legs either.)
+ */
+const expedia: VendorTemplate = {
+  name: 'expedia-hotel',
+  match: (t) => /Expedia-Reiseplan/i.test(t) && /Buchungsdetails/i.test(t) && /Anreise/i.test(t),
+  extract: (t) => {
+    const ref = t.match(/Expedia-Reiseplan:?\s*(\d{6,})/i)?.[1];
+    const block = t.match(/Buchungsdetails\s*\n([\s\S]*?)\nAnreise:/i)?.[1];
+    const checkin = deLongDate(t.match(/Anreise:?\s*([^\n]+)/i)?.[1] ?? '');
+    const checkout = deLongDate(t.match(/Abreise:?\s*([^\n]+)/i)?.[1] ?? '');
+    if (!block || !checkin || !checkout) return [];
+    const lines = block.split('\n').map((s) => s.trim()).filter(Boolean);
+    const name = lines[0];
+    if (!name) return [];
+    const address = lines.slice(1).join(', ') || undefined;
+    const priceM = t.match(/Gesamtpreis\s*([\d.,]+)\s*€/i);
+    return [
+      {
+        type: 'hotel',
+        name,
+        ...(ref ? { booking_reference: ref } : {}),
+        ...(address ? { address } : {}),
+        checkin_time: checkin,
+        checkout_time: checkout,
+        ...(priceM ? { price: priceM[1], currency: 'EUR' } : {}),
+      },
+    ];
+  },
+};
+
+/**
+ * Broker rental-car voucher (vipcars and the like). These print a stable
+ * "PICK-UP DETAILS / DROP-OFF DETAILS" pair — each followed by the depot name and an
+ * English "Mon DD YYYY HH:MM" line — plus a "Reservation No." and a "Payment Details"
+ * total. The model regularly fails the two-column English date, so read it here.
+ */
+const brokerRental: VendorTemplate = {
+  name: 'broker-rental-voucher',
+  match: (t) => /PICK-?UP DETAILS/i.test(t) && /DROP-?OFF DETAILS/i.test(t) && /Reservation\s*No/i.test(t),
+  extract: (t) => {
+    const ref = t.match(/Reservation\s*No\.?:?\s*([A-Z0-9]{5,})/i)?.[1];
+    const block = (label: RegExp) =>
+      t.match(new RegExp(label.source + String.raw`\s*\n([^\n]+)\n([A-Za-z]{3,}\.?\s+\d{1,2},?\s+\d{4}[^\n]*)`, 'i'));
+    const pu = block(/PICK-?UP DETAILS/);
+    const dof = block(/DROP-?OFF DETAILS/);
+    const puTime = pu ? enDateTime(pu[2]) : null;
+    const doTime = dof ? enDateTime(dof[2]) : null;
+    if (!ref || !pu || !dof || !puTime || !doTime) return [];
+    const company = t
+      .match(/SUPPLIER DETAILS\s*\n([^\n]+?)(?:\s+Supplier Reference|\n|$)/i)?.[1]
+      ?.trim()
+      .replace(/\s*\(V\d+\)\s*$/i, ''); // drop the broker's "(V2)" supplier-version tag
+    // Read the first amount in the "Payment Details" block (currency before or after it) and derive
+    // the currency. The amount must contain digits and ISO codes must be whole words ("1 Europcar").
+    const priceM = t.match(
+      /Payment Details[\s\S]{0,120}?(?:(\b(?:EUR|USD|GBP|CHF)|[€$£])\s*(\d+(?:[.,]\d+)*)|(\d+(?:[.,]\d+)*)\s*((?:EUR|USD|GBP|CHF)\b|[€$£]))/i,
+    );
+    const price = priceM?.[2] ?? priceM?.[3];
+    return [
+      {
+        type: 'car',
+        ...(company ? { operator: company } : {}),
+        booking_reference: ref,
+        from_name: pu[1].trim(),
+        to_name: dof[1].trim(),
+        departure_time: puTime,
+        arrival_time: doTime,
+        ...(price ? { price, currency: moneyCurrency(priceM?.[1] ?? priceM?.[4]) } : {}),
+      },
+    ];
+  },
+};
+
+const TEMPLATES: VendorTemplate[] = [sixt, expedia, brokerRental];
 
 /**
  * Try each vendor template; return the first match's result, or null when no template