mirror of
https://github.com/mauriceboe/TREK.git
synced 2026-06-30 10:41:49 +00:00
feat(extract): add Expedia and rental-broker booking templates
Deterministically extract the hotel/rental fields that these vendors print in a stable text layout (name, address, stay/pickup dates, price, reference), so the import no longer depends on the local model for them. Handles German long and abbreviated month names as well as English dates, including the 12-hour clock and the comma-after-day form.
This commit is contained in:
@@ -43,7 +43,7 @@ interface VendorTemplate {
|
||||
extract(text: string): FlatReservation[];
|
||||
}
|
||||
|
||||
/** Parse a German/EU date + time ("24.12.2026, 10:00" / "24.12.2026 10:00 Uhr") to ISO. */
|
||||
/** Parse a German/EU numeric date + time ("24.12.2026, 10:00" / "24.12.2026 10:00 Uhr") to ISO. */
|
||||
function deDateTime(text: string): string | null {
|
||||
const m = text.match(/(\d{2})\.(\d{2})\.(\d{4})(?:[,\s]+(\d{1,2}):(\d{2}))?/);
|
||||
if (!m) return null;
|
||||
@@ -51,6 +51,53 @@ function deDateTime(text: string): string | null {
|
||||
return `${y}-${mo}-${d}` + (h ? `T${h.padStart(2, '0')}:${mi}:00` : '');
|
||||
}
|
||||
|
||||
/**
 * German month name/abbreviation → month number, keyed by the first three letters of the
 * lower-cased word. Besides the standard names this covers the Austrian "Jänner" and the
 * ASCII transliterations "Jaenner" / "Maerz", plus the "Mrz." abbreviation.
 */
const DE_MONTHS: Record<string, number> = {
  jan: 1, 'jän': 1, jae: 1, feb: 2, 'mär': 3, mrz: 3, mae: 3, apr: 4, mai: 5, jun: 6,
  jul: 7, aug: 8, sep: 9, okt: 10, nov: 11, dez: 12,
};

/** English month name/abbreviation → month number (matched on the first three letters). */
const EN_MONTHS: Record<string, number> = {
  jan: 1, feb: 2, mar: 3, apr: 4, may: 5, jun: 6, jul: 7, aug: 8, sep: 9, oct: 10, nov: 11, dec: 12,
};

/**
 * Build a zero-padded ISO date ("YYYY-MM-DD") from already-extracted parts.
 *
 * @param year  four-digit year as captured from the text
 * @param month month number, 1–12
 * @param day   day of month as captured from the text (one or two digits)
 * @returns the ISO date, or null when the day does not exist in that month
 *          (e.g. day 0, 31 April, 29 February in a non-leap year)
 */
function validIsoDate(year: string, month: number, day: string): string | null {
  const d = parseInt(day, 10);
  // Day 0 of the *following* month is the last day of `month`; this also handles leap years.
  const daysInMonth = new Date(Date.UTC(parseInt(year, 10), month, 0)).getUTCDate();
  if (d < 1 || d > daysInMonth) return null;
  return `${year}-${String(month).padStart(2, '0')}-${String(d).padStart(2, '0')}`;
}

/**
 * Parse a German long-form date ("3. Mai 2026", "27. Aug. 2025") to an ISO date — no time.
 *
 * The month word is resolved through DE_MONTHS by its first three letters. Returns null
 * when no date is found, the month word is unknown, or the day does not exist in that month.
 */
function deLongDate(text: string): string | null {
  // Extracted text may carry umlauts in decomposed form (letter + U+0308); NFC recomposes
  // them so "März" matches both the character class and the 'mär' table key.
  const m = text.normalize('NFC').match(/(\d{1,2})\.\s*([A-Za-zäöüÄÖÜ]+)\.?\s+(\d{4})/);
  if (!m) return null;
  const mo = DE_MONTHS[m[2].slice(0, 3).toLowerCase()];
  if (!mo) return null;
  return validIsoDate(m[3], mo, m[1]);
}

/**
 * Parse an English date + optional time to ISO. Tolerates a comma after the day
 * ("Aug 5, 2025") and a 12-hour clock ("Aug 23 2025 01:30 PM" → 13:30) as well as the
 * plain 24-hour form ("Aug 23 2025 13:30", "Aug 30 2025").
 *
 * Returns "YYYY-MM-DD" when no time is present, "YYYY-MM-DDTHH:MM:00" otherwise, and null
 * when nothing parses, the month word is unknown, the day does not exist in that month, or
 * the time is out of range (hour > 23, minute > 59).
 */
function enDateTime(text: string): string | null {
  const m = text.match(/([A-Za-z]{3,})\.?\s+(\d{1,2}),?\s+(\d{4})(?:[,\s]+(\d{1,2}):(\d{2})\s*([AaPp][Mm])?)?/);
  if (!m) return null;
  const mo = EN_MONTHS[m[1].slice(0, 3).toLowerCase()];
  if (!mo) return null;
  const date = validIsoDate(m[3], mo, m[2]);
  if (!date) return null;
  if (!m[4]) return date;
  let h = parseInt(m[4], 10);
  const meridiem = m[6]?.toLowerCase();
  // Only shift genuine 12-hour values: "13:30 PM" is already a 24-hour time with a
  // redundant marker and must not become 25:30.
  if (meridiem === 'pm' && h < 12) h += 12;
  else if (meridiem === 'am' && h === 12) h = 0;
  if (h > 23 || parseInt(m[5], 10) > 59) return null;
  return `${date}T${String(h).padStart(2, '0')}:${m[5]}:00`;
}
|
||||
|
||||
/**
 * Map a currency symbol or code to its ISO 4217 code.
 *
 * Recognises "€", "$" (taken as USD) and "£" anywhere in the token, and passes any
 * three-letter code through upper-cased. Everything else — including a missing, empty or
 * whitespace-only token — defaults to EUR for the EU-centric broker vouchers.
 *
 * @param token raw currency text as captured from the document; surrounding whitespace is ignored
 * @returns an upper-case three-letter currency code
 */
function moneyCurrency(token: string | undefined): string {
  // Trim first: a padded code such as " usd " would otherwise fail the three-letter test
  // below and silently fall back to EUR.
  const code = token?.trim().toUpperCase();
  if (!code) return 'EUR';
  if (code.includes('€')) return 'EUR';
  if (code.includes('$')) return 'USD';
  if (code.includes('£')) return 'GBP';
  return /^[A-Z]{3}$/.test(code) ? code : 'EUR';
}
|
||||
|
||||
/**
|
||||
* Example: Sixt rental confirmation. Sixt print-PDFs carry no barcode but a stable
|
||||
* "Reservierungsnummer" + Anmietung/Rückgabe block. Conservative: only fires on the Sixt
|
||||
@@ -84,7 +131,86 @@ const sixt: VendorTemplate = {
|
||||
},
|
||||
};
|
||||
|
||||
const TEMPLATES: VendorTemplate[] = [sixt];
|
||||
/**
 * Template for Expedia's German receipt ("Beleg"). These confirmation PDFs have no barcode,
 * but their text layer contains a stable "Buchungsdetails" block (hotel name, then address
 * lines, terminated by "Anreise:"), the Anreise/Abreise dates, an "Expedia-Reiseplan"
 * number and a "Gesamtpreis" in euros — enough to build the hotel entry deterministically
 * even when the local model misses the address or price.
 *
 * Only the hotel is produced: on a combined hotel+flight receipt the airline lines carry
 * no IATA flight number, so no flight legs are derived here.
 */
const expedia: VendorTemplate = {
  name: 'expedia-hotel',
  // All three markers must be present before the template claims the document.
  match: (t) => [/Expedia-Reiseplan/i, /Buchungsdetails/i, /Anreise/i].every((marker) => marker.test(t)),
  extract: (t) => {
    // Everything between the "Buchungsdetails" heading and the "Anreise:" line.
    const details = /Buchungsdetails\s*\n([\s\S]*?)\nAnreise:/i.exec(t)?.[1];
    if (!details) return [];

    // Both stay dates are mandatory; each is read from the rest of its label's line.
    const arrival = deLongDate(/Anreise:?\s*([^\n]+)/i.exec(t)?.[1] ?? '');
    const departure = deLongDate(/Abreise:?\s*([^\n]+)/i.exec(t)?.[1] ?? '');
    if (!arrival || !departure) return [];

    // First non-blank line is the hotel name; any remaining lines form the address.
    const [hotelName, ...addressLines] = details
      .split('\n')
      .map((line) => line.trim())
      .filter((line) => line.length > 0);
    if (!hotelName) return [];
    const address = addressLines.join(', ') || undefined;

    const reference = /Expedia-Reiseplan:?\s*(\d{6,})/i.exec(t)?.[1];
    const total = /Gesamtpreis\s*([\d.,]+)\s*€/i.exec(t);

    return [
      {
        type: 'hotel',
        name: hotelName,
        ...(reference ? { booking_reference: reference } : {}),
        ...(address ? { address } : {}),
        checkin_time: arrival,
        checkout_time: departure,
        // The amount is passed through exactly as printed; the pattern only matches a "€" total.
        ...(total ? { price: total[1], currency: 'EUR' } : {}),
      },
    ];
  },
};
|
||||
|
||||
/**
 * Broker rental-car voucher (vipcars and the like). These print a stable
 * "PICK-UP DETAILS / DROP-OFF DETAILS" pair — each followed by the depot name and an
 * English "Mon DD YYYY HH:MM" line — plus a "Reservation No." and a "Payment Details"
 * total. The model regularly fails the two-column English date, so read it here.
 *
 * `extract` returns a single car reservation, or an empty list when the reservation
 * number, either depot block, or either date/time cannot be read. Operator and price are
 * optional and only included when found.
 */
const brokerRental: VendorTemplate = {
  name: 'broker-rental-voucher',
  match: (t) => /PICK-?UP DETAILS/i.test(t) && /DROP-?OFF DETAILS/i.test(t) && /Reservation\s*No/i.test(t),
  extract: (t) => {
    const ref = t.match(/Reservation\s*No\.?:?\s*([A-Z0-9]{5,})/i)?.[1];

    // For a section heading, capture [1] the next line (depot name) and [2] the line after
    // it, which must start like an English date ("Aug 23 2025 …").
    const block = (label: RegExp) =>
      t.match(new RegExp(label.source + String.raw`\s*\n([^\n]+)\n([A-Za-z]{3,}\.?\s+\d{1,2},?\s+\d{4}[^\n]*)`, 'i'));
    const pu = block(/PICK-?UP DETAILS/);
    const dof = block(/DROP-?OFF DETAILS/);
    const puTime = pu ? enDateTime(pu[2]) : null;
    const doTime = dof ? enDateTime(dof[2]) : null;
    if (!ref || !pu || !dof || !puTime || !doTime) return [];

    const company = t
      .match(/SUPPLIER DETAILS\s*\n([^\n]+?)(?:\s+Supplier Reference|\n|$)/i)?.[1]
      ?.trim()
      .replace(/\s*\(V\d+\)\s*$/i, ''); // drop the broker's "(V2)" supplier-version tag

    // Read the first amount in the "Payment Details" block; accept the currency on either
    // side of the number and derive it (don't assume EUR), so non-EUR vouchers still get a price.
    // The amount must start and end with a digit: a looser `[\d.,]+` would accept bare
    // punctuation ("Payment Details. EUR 100" → price ".") or keep a trailing sentence dot.
    const priceM = t.match(
      /Payment Details[\s\S]{0,120}?(?:(EUR|USD|GBP|CHF|€|\$|£)\s*(\d+(?:[.,]\d+)*)|(\d+(?:[.,]\d+)*)\s*(EUR|USD|GBP|CHF|€|\$|£))/i,
    );
    // Groups 1/2 belong to the currency-first form, 3/4 to the amount-first form.
    const price = priceM ? priceM[2] ?? priceM[3] : undefined;
    const currencyToken = priceM ? priceM[1] ?? priceM[4] : undefined;

    return [
      {
        type: 'car',
        ...(company ? { operator: company } : {}),
        booking_reference: ref,
        from_name: pu[1].trim(),
        to_name: dof[1].trim(),
        departure_time: puTime,
        arrival_time: doTime,
        ...(price ? { price, currency: moneyCurrency(currencyToken) } : {}),
      },
    ];
  },
};
|
||||
|
||||
// Vendor templates known to this module. The dispatcher documented below returns the
// first match's result, so earlier entries presumably take precedence when more than one
// template matches — confirm against the dispatcher body before reordering.
const TEMPLATES: VendorTemplate[] = [sixt, expedia, brokerRental];
|
||||
|
||||
/**
|
||||
* Try each vendor template; return the first match's result, or null when no template
|
||||
|
||||
Reference in New Issue
Block a user