mirror of
https://github.com/mauriceboe/TREK.git
synced 2026-06-30 18:46:00 +00:00
feat(extract): add Expedia and rental-broker booking templates
Pull the hotel/rental fields these vendors print in a stable text layout (name, address, stay/pickup dates, price, reference) deterministically, so the import stops depending on the local model for them. Handles German long/abbreviated months and English dates incl. 12-hour and comma forms.
This commit is contained in:
@@ -43,7 +43,7 @@ interface VendorTemplate {
|
|||||||
extract(text: string): FlatReservation[];
|
extract(text: string): FlatReservation[];
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Parse a German/EU date + time ("24.12.2026, 10:00" / "24.12.2026 10:00 Uhr") to ISO. */
|
/** Parse a German/EU numeric date + time ("24.12.2026, 10:00" / "24.12.2026 10:00 Uhr") to ISO. */
|
||||||
function deDateTime(text: string): string | null {
|
function deDateTime(text: string): string | null {
|
||||||
const m = text.match(/(\d{2})\.(\d{2})\.(\d{4})(?:[,\s]+(\d{1,2}):(\d{2}))?/);
|
const m = text.match(/(\d{2})\.(\d{2})\.(\d{4})(?:[,\s]+(\d{1,2}):(\d{2}))?/);
|
||||||
if (!m) return null;
|
if (!m) return null;
|
||||||
@@ -51,6 +51,53 @@ function deDateTime(text: string): string | null {
|
|||||||
return `${y}-${mo}-${d}` + (h ? `T${h.padStart(2, '0')}:${mi}:00` : '');
|
return `${y}-${mo}-${d}` + (h ? `T${h.padStart(2, '0')}:${mi}:00` : '');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** German month name/abbreviation → month number (matched on the first three letters). */
|
||||||
|
const DE_MONTHS: Record<string, number> = {
|
||||||
|
jan: 1, feb: 2, 'mär': 3, mrz: 3, apr: 4, mai: 5, jun: 6, jul: 7, aug: 8, sep: 9, okt: 10, nov: 11, dez: 12,
|
||||||
|
};
|
||||||
|
/** English month name/abbreviation → month number (matched on the first three letters). */
|
||||||
|
const EN_MONTHS: Record<string, number> = {
|
||||||
|
jan: 1, feb: 2, mar: 3, apr: 4, may: 5, jun: 6, jul: 7, aug: 8, sep: 9, oct: 10, nov: 11, dec: 12,
|
||||||
|
};
|
||||||
|
|
||||||
|
/** Parse a German long-form date ("3. Mai 2026", "27. Aug. 2025") to an ISO date — no time. */
|
||||||
|
function deLongDate(text: string): string | null {
|
||||||
|
const m = text.match(/(\d{1,2})\.\s*([A-Za-zäöüÄÖÜ]+)\.?\s+(\d{4})/);
|
||||||
|
if (!m) return null;
|
||||||
|
const mo = DE_MONTHS[m[2].slice(0, 3).toLowerCase()];
|
||||||
|
if (!mo) return null;
|
||||||
|
return `${m[3]}-${String(mo).padStart(2, '0')}-${m[1].padStart(2, '0')}`;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parse an English date + optional time to ISO. Tolerates a comma after the day
|
||||||
|
* ("Aug 5, 2025") and a 12-hour clock ("Aug 23 2025 01:30 PM" → 13:30) as well as the
|
||||||
|
* plain 24-hour form ("Aug 23 2025 13:30", "Aug 30 2025").
|
||||||
|
*/
|
||||||
|
function enDateTime(text: string): string | null {
|
||||||
|
const m = text.match(/([A-Za-z]{3,})\.?\s+(\d{1,2}),?\s+(\d{4})(?:[,\s]+(\d{1,2}):(\d{2})\s*([AaPp][Mm])?)?/);
|
||||||
|
if (!m) return null;
|
||||||
|
const mo = EN_MONTHS[m[1].slice(0, 3).toLowerCase()];
|
||||||
|
if (!mo) return null;
|
||||||
|
const date = `${m[3]}-${String(mo).padStart(2, '0')}-${m[2].padStart(2, '0')}`;
|
||||||
|
if (!m[4]) return date;
|
||||||
|
let h = parseInt(m[4], 10);
|
||||||
|
const meridiem = m[6]?.toLowerCase();
|
||||||
|
if (meridiem === 'pm' && h !== 12) h += 12;
|
||||||
|
else if (meridiem === 'am' && h === 12) h = 0;
|
||||||
|
return `${date}T${String(h).padStart(2, '0')}:${m[5]}:00`;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Symbol/code → ISO 4217 (defaults to EUR for the EU-centric broker vouchers). */
|
||||||
|
function moneyCurrency(token: string | undefined): string {
|
||||||
|
if (!token) return 'EUR';
|
||||||
|
const u = token.toUpperCase();
|
||||||
|
if (u.includes('€')) return 'EUR';
|
||||||
|
if (u.includes('$')) return 'USD';
|
||||||
|
if (u.includes('£')) return 'GBP';
|
||||||
|
return /^[A-Z]{3}$/.test(u) ? u : 'EUR';
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Example: Sixt rental confirmation. Sixt print-PDFs carry no barcode but a stable
|
* Example: Sixt rental confirmation. Sixt print-PDFs carry no barcode but a stable
|
||||||
* "Reservierungsnummer" + Anmietung/Rückgabe block. Conservative: only fires on the Sixt
|
* "Reservierungsnummer" + Anmietung/Rückgabe block. Conservative: only fires on the Sixt
|
||||||
@@ -84,7 +131,86 @@ const sixt: VendorTemplate = {
|
|||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
const TEMPLATES: VendorTemplate[] = [sixt];
|
/**
|
||||||
|
* Expedia receipt ("Beleg"). Expedia's German confirmation PDFs carry no barcode but a
|
||||||
|
* stable "Buchungsdetails" block — hotel name, address, Anreise/Abreise — and an
|
||||||
|
* "Expedia-Reiseplan" number + "Gesamtpreis". The text layer reads these cleanly even
|
||||||
|
* when the local model misses the address/price, so pull the hotel deterministically.
|
||||||
|
* (A combined hotel+flight receipt only yields the hotel here — the airline lines carry
|
||||||
|
* no IATA flight number, which the model can't reliably turn into legs either.)
|
||||||
|
*/
|
||||||
|
const expedia: VendorTemplate = {
|
||||||
|
name: 'expedia-hotel',
|
||||||
|
match: (t) => /Expedia-Reiseplan/i.test(t) && /Buchungsdetails/i.test(t) && /Anreise/i.test(t),
|
||||||
|
extract: (t) => {
|
||||||
|
const ref = t.match(/Expedia-Reiseplan:?\s*(\d{6,})/i)?.[1];
|
||||||
|
const block = t.match(/Buchungsdetails\s*\n([\s\S]*?)\nAnreise:/i)?.[1];
|
||||||
|
const checkin = deLongDate(t.match(/Anreise:?\s*([^\n]+)/i)?.[1] ?? '');
|
||||||
|
const checkout = deLongDate(t.match(/Abreise:?\s*([^\n]+)/i)?.[1] ?? '');
|
||||||
|
if (!block || !checkin || !checkout) return [];
|
||||||
|
const lines = block.split('\n').map((s) => s.trim()).filter(Boolean);
|
||||||
|
const name = lines[0];
|
||||||
|
if (!name) return [];
|
||||||
|
const address = lines.slice(1).join(', ') || undefined;
|
||||||
|
const priceM = t.match(/Gesamtpreis\s*([\d.,]+)\s*€/i);
|
||||||
|
return [
|
||||||
|
{
|
||||||
|
type: 'hotel',
|
||||||
|
name,
|
||||||
|
...(ref ? { booking_reference: ref } : {}),
|
||||||
|
...(address ? { address } : {}),
|
||||||
|
checkin_time: checkin,
|
||||||
|
checkout_time: checkout,
|
||||||
|
...(priceM ? { price: priceM[1], currency: 'EUR' } : {}),
|
||||||
|
},
|
||||||
|
];
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Broker rental-car voucher (vipcars and the like). These print a stable
|
||||||
|
* "PICK-UP DETAILS / DROP-OFF DETAILS" pair — each followed by the depot name and an
|
||||||
|
* English "Mon DD YYYY HH:MM" line — plus a "Reservation No." and a "Payment Details"
|
||||||
|
* total. The model regularly fails the two-column English date, so read it here.
|
||||||
|
*/
|
||||||
|
const brokerRental: VendorTemplate = {
|
||||||
|
name: 'broker-rental-voucher',
|
||||||
|
match: (t) => /PICK-?UP DETAILS/i.test(t) && /DROP-?OFF DETAILS/i.test(t) && /Reservation\s*No/i.test(t),
|
||||||
|
extract: (t) => {
|
||||||
|
const ref = t.match(/Reservation\s*No\.?:?\s*([A-Z0-9]{5,})/i)?.[1];
|
||||||
|
const block = (label: RegExp) =>
|
||||||
|
t.match(new RegExp(label.source + String.raw`\s*\n([^\n]+)\n([A-Za-z]{3,}\.?\s+\d{1,2},?\s+\d{4}[^\n]*)`, 'i'));
|
||||||
|
const pu = block(/PICK-?UP DETAILS/);
|
||||||
|
const dof = block(/DROP-?OFF DETAILS/);
|
||||||
|
const puTime = pu ? enDateTime(pu[2]) : null;
|
||||||
|
const doTime = dof ? enDateTime(dof[2]) : null;
|
||||||
|
if (!ref || !pu || !dof || !puTime || !doTime) return [];
|
||||||
|
const company = t
|
||||||
|
.match(/SUPPLIER DETAILS\s*\n([^\n]+?)(?:\s+Supplier Reference|\n|$)/i)?.[1]
|
||||||
|
?.trim()
|
||||||
|
.replace(/\s*\(V\d+\)\s*$/i, ''); // drop the broker's "(V2)" supplier-version tag
|
||||||
|
// Read the first amount in the "Payment Details" block; accept the currency on either
|
||||||
|
// side of the number and derive it (don't assume EUR), so non-EUR vouchers still get a price.
|
||||||
|
const priceM = t.match(
|
||||||
|
/Payment Details[\s\S]{0,120}?(?:(EUR|USD|GBP|CHF|€|\$|£)\s*([\d.,]+)|([\d.,]+)\s*(EUR|USD|GBP|CHF|€|\$|£))/i,
|
||||||
|
);
|
||||||
|
const price = priceM ? priceM[2] ?? priceM[3] : undefined;
|
||||||
|
return [
|
||||||
|
{
|
||||||
|
type: 'car',
|
||||||
|
...(company ? { operator: company } : {}),
|
||||||
|
booking_reference: ref,
|
||||||
|
from_name: pu[1].trim(),
|
||||||
|
to_name: dof[1].trim(),
|
||||||
|
departure_time: puTime,
|
||||||
|
arrival_time: doTime,
|
||||||
|
...(price ? { price, currency: moneyCurrency(priceM![1] ?? priceM![4]) } : {}),
|
||||||
|
},
|
||||||
|
];
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
const TEMPLATES: VendorTemplate[] = [sixt, expedia, brokerRental];
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Try each vendor template; return the first match's result, or null when no template
|
* Try each vendor template; return the first match's result, or null when no template
|
||||||
|
|||||||
Reference in New Issue
Block a user