feat(extract): extract data using LLM

This commit is contained in:
jubnl
2026-06-24 18:45:52 +02:00
parent 49fb2fded2
commit 186625591a
79 changed files with 2191 additions and 92 deletions
@@ -0,0 +1,61 @@
import { describe, it, expect, vi, beforeEach } from 'vitest';
import { HttpException } from '@nestjs/common';
import { BookingImportController } from '../../../../src/nest/booking-import/booking-import.controller';
import type { BookingImportService } from '../../../../src/nest/booking-import/booking-import.service';
import type { User } from '../../../../src/types';
/** Minimal authenticated user passed as the first argument of every controller call below. */
const user = { id: 1, role: 'user' } as User;

/**
 * Builds a stub upload with the given original file name and a one-byte buffer.
 * Only `originalname` and `buffer` are populated; the cast supplies the rest of the type.
 */
function file(name = 'a.pdf'): Express.Multer.File {
  const upload = { originalname: name, buffer: Buffer.from('x') };
  return upload as Express.Multer.File;
}
/**
 * Creates a BookingImportController backed by a fully stubbed service.
 *
 * By default every stub takes the permissive path: trip access resolves,
 * `canEdit`, `isAvailable` and `aiAvailable` return true, and `preview`
 * resolves to an empty result. Members given in `over` replace the defaults.
 *
 * @param over - Service members to override for a single test.
 * @returns The controller under test (`c`) and the stub service (`svc`).
 */
function make(over: Partial<BookingImportService> = {}) {
  const defaults = {
    verifyTripAccess: vi.fn(() => ({ user_id: 1 })),
    canEdit: vi.fn(() => true),
    isAvailable: vi.fn(() => true),
    aiAvailable: vi.fn(() => true),
    preview: vi.fn(async () => ({ items: [], warnings: [], files: [] })),
  };
  const svc = { ...defaults, ...over } as unknown as BookingImportService;
  const c = new BookingImportController(svc);
  return { c, svc };
}
/**
 * Runs `fn`, expects it to reject with an HttpException, and returns its HTTP status.
 *
 * @param fn - Thunk that invokes the controller method under test.
 * @returns The status code carried by the thrown HttpException.
 * @throws Error('expected throw') when `fn` resolves instead of rejecting.
 */
async function status(fn: () => Promise<unknown>): Promise<number> {
  let caught: unknown;
  let threw = false;
  try {
    await fn();
  } catch (e) {
    caught = e;
    threw = true;
  }
  if (!threw) throw new Error('expected throw');
  expect(caught).toBeInstanceOf(HttpException);
  return (caught as HttpException).getStatus();
}
beforeEach(() => vi.clearAllMocks());

// Controller-level checks for preview(): mode validation, availability gates,
// the empty-upload guard, and argument pass-through to the service.
describe('BookingImportController.preview', () => {
  it('rejects an invalid mode with 400', async () => {
    const { c: controller } = make();
    const code = await status(() => controller.preview(user, 't1', [file()], 'bogus'));
    expect(code).toBe(400);
  });

  it('returns 409 for force-ai when AI is not configured', async () => {
    const aiAvailable = vi.fn(() => false) as any;
    const { c: controller } = make({ aiAvailable });
    const code = await status(() => controller.preview(user, 't1', [file()], 'force-ai'));
    expect(code).toBe(409);
  });

  it('returns 503 for no-ai when the extractor is unavailable', async () => {
    const isAvailable = vi.fn(() => false) as any;
    const { c: controller } = make({ isAvailable });
    const code = await status(() => controller.preview(user, 't1', [file()], 'no-ai'));
    expect(code).toBe(503);
  });

  it('returns 400 when no files are uploaded', async () => {
    const { c: controller } = make();
    const code = await status(() => controller.preview(user, 't1', [], 'no-ai'));
    expect(code).toBe(400);
  });

  it('passes the parsed mode and user id through to the service', async () => {
    const { c: controller, svc: service } = make();
    await controller.preview(user, 't1', [file()], 'fallback-on-empty');
    expect(service.preview).toHaveBeenCalledWith([expect.anything()], 'fallback-on-empty', 1);
  });

  it('defaults the mode to no-ai when omitted', async () => {
    const { c: controller, svc: service } = make();
    await controller.preview(user, 't1', [file()], undefined);
    expect(service.preview).toHaveBeenCalledWith([expect.anything()], 'no-ai', 1);
  });
});
@@ -0,0 +1,79 @@
import { describe, it, expect, vi, beforeEach } from 'vitest';
import { HttpException } from '@nestjs/common';
// Mock the heavy side-effect imports so the service module loads cleanly; the
// preview() path under test only touches the extractor + llmParse deps.
// Each factory below replaces the real module with just the named exports
// listed in it; every function export is an inert stub.
vi.mock('../../../../src/db/database', () => ({ db: { prepare: vi.fn() }, closeDb: () => {}, reinitialize: () => {} }));
vi.mock('../../../../src/websocket', () => ({ broadcast: vi.fn() }));
// checkPermission always grants, so permission checks never block a test.
vi.mock('../../../../src/services/permissions', () => ({ checkPermission: vi.fn(() => true) }));
vi.mock('../../../../src/services/tripAccess', () => ({ verifyTripAccess: vi.fn() }));
vi.mock('../../../../src/services/reservationService', () => ({ createReservation: vi.fn() }));
vi.mock('../../../../src/services/placeService', () => ({ createPlace: vi.fn() }));
vi.mock('../../../../src/services/mapsService', () => ({ searchNominatim: vi.fn() }));
import { BookingImportService } from '../../../../src/nest/booking-import/booking-import.service';
/** Lodging reservation node the extractor / LLM stubs return (check-in 2026-06-11, check-out 2026-06-12). */
const HOTEL_KI = {
  '@type': 'LodgingReservation',
  reservationNumber: 'ABC',
  reservationFor: { name: 'Hotel X' },
  checkinTime: '2026-06-11T15:00',
  checkoutTime: '2026-06-12T11:00',
};

/** Builds a stub upload: a one-byte buffer plus the given original file name. */
function file(name = 'a.pdf'): any {
  return { buffer: Buffer.from('x'), originalname: name };
}
/**
 * Builds a BookingImportService wired to stub extractor and LLM-parse dependencies.
 *
 * @param opts.kit - Value the extractor's isAvailable() returns (false when omitted).
 * @param opts.ai - Value the LLM parser's isAvailable() returns (false when omitted).
 * @param opts.extract - Implementation for extractor.extract; defaults to resolving `[]`.
 * @param opts.parse - Implementation for llmParse.parse; defaults to resolving an empty result.
 * @returns The service plus both stubs so tests can assert on their calls.
 */
function make(opts: { kit?: boolean; ai?: boolean; extract?: any; parse?: any }) {
  const extractImpl = opts.extract ?? (async () => []);
  const parseImpl = opts.parse ?? (async () => ({ kiItems: [], warnings: [] }));
  const extractor = {
    isAvailable: () => opts.kit ?? false,
    extract: vi.fn(extractImpl),
  };
  const llmParse = {
    isAvailable: () => opts.ai ?? false,
    parse: vi.fn(parseImpl),
  };
  const svc = new BookingImportService(extractor as any, llmParse as any);
  return { svc, extractor, llmParse };
}
beforeEach(() => vi.clearAllMocks());

// Service-level checks for preview(): which parser runs in each mode, how the
// per-file report is filled in, and when items are flagged for review.
describe('BookingImportService.preview', () => {
  it('no-ai: maps kitinerary items, does not force needs_review, reports aiUsed:false', async () => {
    const { svc, llmParse } = make({ kit: true, ai: false, extract: async () => [HOTEL_KI] });
    const res = await svc.preview([file()], 'no-ai', 1);
    expect(res.items).toHaveLength(1);
    expect(res.items[0].needs_review).toBeFalsy();
    expect(res.files).toEqual([{ fileName: 'a.pdf', aiAvailable: false, aiUsed: false }]);
    expect(llmParse.parse).not.toHaveBeenCalled();
  });

  it('throws 503 when neither parser is available', async () => {
    const { svc } = make({ kit: false, ai: false });
    // Record the rejection first and raise the "did not throw" sentinel outside
    // the try/catch. Throwing it inside the try would let this test's own catch
    // swallow it and report a misleading instanceOf mismatch instead.
    let err: unknown;
    let threw = false;
    try {
      await svc.preview([file()], 'no-ai', 1);
    } catch (e) {
      err = e;
      threw = true;
    }
    if (!threw) throw new Error('expected throw');
    expect(err).toBeInstanceOf(HttpException);
    expect((err as HttpException).getStatus()).toBe(503);
  });

  it('fallback-on-empty: runs the LLM when kitinerary finds nothing and flags needs_review', async () => {
    const { svc, extractor, llmParse } = make({
      kit: true,
      ai: true,
      extract: async () => [],
      parse: async () => ({ kiItems: [HOTEL_KI], warnings: [] }),
    });
    const res = await svc.preview([file()], 'fallback-on-empty', 1);
    expect(extractor.extract).toHaveBeenCalled();
    expect(llmParse.parse).toHaveBeenCalled();
    expect(res.items).toHaveLength(1);
    expect(res.items[0].needs_review).toBe(true);
    expect(res.files![0]).toEqual({ fileName: 'a.pdf', aiAvailable: true, aiUsed: true });
  });

  it('fallback-on-empty: skips the LLM when kitinerary already found items', async () => {
    const { svc, llmParse } = make({ kit: true, ai: true, extract: async () => [HOTEL_KI] });
    const res = await svc.preview([file()], 'fallback-on-empty', 1);
    expect(llmParse.parse).not.toHaveBeenCalled();
    expect(res.files![0].aiUsed).toBe(false);
  });

  it('force-ai: skips kitinerary entirely and uses the LLM', async () => {
    const { svc, extractor, llmParse } = make({
      kit: true,
      ai: true,
      parse: async () => ({ kiItems: [HOTEL_KI], warnings: [] }),
    });
    const res = await svc.preview([file()], 'force-ai', 1);
    expect(extractor.extract).not.toHaveBeenCalled();
    expect(llmParse.parse).toHaveBeenCalled();
    expect(res.items[0].needs_review).toBe(true);
  });
});
@@ -0,0 +1,96 @@
import { describe, it, expect, vi, beforeEach } from 'vitest';
import { OpenAiCompatibleClient } from '../../../../src/nest/llm-parse/clients/openai-compatible.client';
import { AnthropicClient } from '../../../../src/nest/llm-parse/clients/anthropic.client';
import type { LlmExtractionInput } from '../../../../src/nest/llm-parse/llm-provider.interface';
// Shared extraction input for every client test below. Individual tests pass
// it as-is or spread it and add `baseUrl` or `file`.
const baseInput: LlmExtractionInput = {
prompt: 'system',
jsonSchema: { type: 'object' },
model: 'm',
text: 'Flight AB123',
};
/**
 * Replaces the global `fetch` with a vitest mock driven by `impl`.
 *
 * @param impl - Handler receiving the request URL and init; returns the stub response.
 * @returns The installed mock, so tests can inspect `mock.calls`.
 */
function mockFetch(impl: (url: string, init: RequestInit) => Promise<Response> | Response) {
  const fetchStub = vi.fn(impl as any);
  vi.stubGlobal('fetch', fetchStub);
  return fetchStub;
}
/**
 * Builds a minimal Response stand-in exposing only `ok`, `status`, `json()` and `text()`.
 *
 * @param body - Value returned by `json()`; `text()` returns its JSON serialization.
 * @param ok - Value of the `ok` flag (defaults to true).
 * @param status - HTTP status code (defaults to 200).
 * @returns The stub, cast to `Response`.
 */
function jsonResponse(body: unknown, ok = true, status = 200): Response {
  const stub = {
    ok,
    status,
    json: async () => body,
    text: async () => JSON.stringify(body),
  };
  return stub as unknown as Response;
}
// Remove any global stub (the fetch mock) installed by a previous test.
beforeEach(() => vi.unstubAllGlobals());
// OpenAI-compatible client: request URL construction, tolerant parsing of the
// model's message content, error propagation, and which attachment types are
// sent as content parts.
describe('OpenAiCompatibleClient', () => {
it('posts to {baseUrl}/chat/completions and returns the reservations array', async () => {
const fetchFn = mockFetch(() =>
jsonResponse({ choices: [{ message: { content: JSON.stringify({ reservations: [{ '@type': 'FlightReservation' }] }) } }] }),
);
// baseUrl ends with a slash; the expected URL below has a single slash before chat/completions.
const out = await new OpenAiCompatibleClient().extract({ ...baseInput, baseUrl: 'http://localhost:11434/v1/' });
expect(out).toEqual([{ '@type': 'FlightReservation' }]);
expect(fetchFn.mock.calls[0][0]).toBe('http://localhost:11434/v1/chat/completions');
});
it('tolerates code-fenced JSON', async () => {
// Message content is the JSON payload wrapped in a ```json fenced block.
mockFetch(() =>
jsonResponse({ choices: [{ message: { content: '```json\n{"reservations":[{"@type":"TrainReservation"}]}\n```' } }] }),
);
const out = await new OpenAiCompatibleClient().extract(baseInput);
expect(out).toEqual([{ '@type': 'TrainReservation' }]);
});
it('returns [] on malformed content', async () => {
mockFetch(() => jsonResponse({ choices: [{ message: { content: 'not json' } }] }));
expect(await new OpenAiCompatibleClient().extract(baseInput)).toEqual([]);
});
it('throws on non-2xx', async () => {
// ok=false with status 401; the rejection message is expected to mention the status code.
mockFetch(() => jsonResponse({ error: 'bad' }, false, 401));
await expect(new OpenAiCompatibleClient().extract(baseInput)).rejects.toThrow(/401/);
});
it('sends an image natively as image_url but never a file/pdf part', async () => {
const fetchFn = mockFetch(() => jsonResponse({ choices: [{ message: { content: '{"reservations":[]}' } }] }));
await new OpenAiCompatibleClient().extract({ ...baseInput, file: { mimeType: 'image/png', data: Buffer.from('IMG') } });
// Content parts are read from the second message (index 1) of the request body.
let parts = JSON.parse((fetchFn.mock.calls[0][1] as RequestInit).body as string).messages[1].content;
expect(parts.some((p: any) => p.type === 'image_url')).toBe(true);
expect(parts.some((p: any) => p.type === 'file')).toBe(false);
// A PDF must NOT be sent as a content part (Ollama rejects it).
await new OpenAiCompatibleClient().extract({ ...baseInput, file: { mimeType: 'application/pdf', data: Buffer.from('PDF') } });
// Same fetch stub as above, so this second request is recorded as mock.calls[1].
parts = JSON.parse((fetchFn.mock.calls[1][1] as RequestInit).body as string).messages[1].content;
expect(parts.every((p: any) => p.type !== 'file' && p.type !== 'image_url')).toBe(true);
});
});
// Anthropic client: forced tool use, refusal handling, error propagation, and
// native PDF attachments.
describe('AnthropicClient', () => {
  it('forces the emit_reservations tool and reads its input', async () => {
    const fetchFn = mockFetch(() =>
      jsonResponse({
        stop_reason: 'tool_use',
        content: [
          {
            type: 'tool_use',
            name: 'emit_reservations',
            input: { reservations: [{ '@type': 'LodgingReservation' }] },
          },
        ],
      }),
    );

    const reservations = await new AnthropicClient().extract(baseInput);
    expect(reservations).toEqual([{ '@type': 'LodgingReservation' }]);

    // Inspect the request the client actually sent.
    const init = fetchFn.mock.calls[0][1] as RequestInit;
    const body = JSON.parse(init.body as string);
    expect(body.tool_choice).toEqual({ type: 'tool', name: 'emit_reservations' });
    expect(body.tools[0].name).toBe('emit_reservations');
  });

  it('throws on a refusal stop_reason', async () => {
    mockFetch(() => jsonResponse({ stop_reason: 'refusal', content: [] }));
    const pending = new AnthropicClient().extract(baseInput);
    await expect(pending).rejects.toThrow(/declined/i);
  });

  it('throws on non-2xx', async () => {
    mockFetch(() => jsonResponse({ error: 'bad' }, false, 500));
    const pending = new AnthropicClient().extract(baseInput);
    await expect(pending).rejects.toThrow(/500/);
  });

  it('sends a native pdf as a base64 document block', async () => {
    const fetchFn = mockFetch(() =>
      jsonResponse({ content: [{ type: 'tool_use', name: 'emit_reservations', input: { reservations: [] } }] }),
    );
    const pdf = { mimeType: 'application/pdf', data: Buffer.from('PDF') };
    await new AnthropicClient().extract({ ...baseInput, file: pdf });

    const init = fetchFn.mock.calls[0][1] as RequestInit;
    const blocks = JSON.parse(init.body as string).messages[0].content;
    const hasBase64Document = blocks.some((b: any) => b.type === 'document' && b.source.type === 'base64');
    expect(hasBase64Document).toBe(true);
  });
});
@@ -0,0 +1,67 @@
import { describe, it, expect, vi, beforeEach } from 'vitest';
// All mocks are created through vi.hoisted so the vi.mock factories below can
// reference them.
// dbMock.prepare() always hands back the same statement stub; it is also
// exposed as `_stmt` so tests can program what `get()` returns.
const { dbMock } = vi.hoisted(() => {
const stmt = { get: vi.fn() };
return { dbMock: { prepare: vi.fn(() => stmt), _stmt: stmt } };
});
vi.mock('../../../../src/db/database', () => ({ db: dbMock, closeDb: () => {}, reinitialize: () => {} }));
// Addon switch; its return value is set per test.
const { isAddonEnabled } = vi.hoisted(() => ({ isAddonEnabled: vi.fn() }));
vi.mock('../../../../src/services/adminService', () => ({ isAddonEnabled }));
// Per-user settings lookups: empty settings and no stored key by default.
const { getUserSettings, getDecryptedUserSetting } = vi.hoisted(() => ({
getUserSettings: vi.fn(() => ({}) as Record<string, unknown>),
getDecryptedUserSetting: vi.fn(() => null as string | null),
}));
vi.mock('../../../../src/services/settingsService', () => ({ getUserSettings, getDecryptedUserSetting }));
import { resolveLlmConfig } from '../../../../src/nest/llm-parse/llm-config.resolver';
/**
 * Programs the row the mocked statement's `get()` returns.
 *
 * @param config - Instance config to serialize into the row's `config` column;
 *   `undefined` makes `get()` return `undefined` (no row).
 */
function setInstanceConfig(config: unknown) {
  const row = config === undefined ? undefined : { config: JSON.stringify(config) };
  dbMock._stmt.get.mockReturnValue(row);
}
beforeEach(() => {
  vi.clearAllMocks();
  // Baseline for every test: addon enabled, no instance row, empty user
  // settings, and no stored per-user key.
  isAddonEnabled.mockReturnValue(true);
  setInstanceConfig(undefined);
  getUserSettings.mockReturnValue({});
  getDecryptedUserSetting.mockReturnValue(null);
});

// Resolution order under test: addon gate, then instance config, then per-user config.
describe('resolveLlmConfig', () => {
  it('returns null when the addon is disabled', () => {
    isAddonEnabled.mockReturnValue(false);
    const resolved = resolveLlmConfig(1);
    expect(resolved).toBeNull();
  });

  it('uses instance config when present (and decrypts the key)', () => {
    setInstanceConfig({ provider: 'anthropic', model: 'claude-opus-4-8', apiKey: 'sk-plain', multimodal: true });
    const resolved = resolveLlmConfig(1);
    expect(resolved).toEqual({
      provider: 'anthropic',
      model: 'claude-opus-4-8',
      baseUrl: undefined,
      apiKey: 'sk-plain',
      multimodal: true,
    });
  });

  it('falls back to per-user config when instance config is incomplete', () => {
    setInstanceConfig({ provider: 'anthropic' }); // no model → not usable
    getUserSettings.mockReturnValue({
      llm_provider: 'local',
      llm_model: 'nuextract',
      llm_base_url: 'http://x/v1',
      llm_multimodal: true,
    });
    getDecryptedUserSetting.mockReturnValue('user-key');

    const resolved = resolveLlmConfig(7);

    expect(resolved).toEqual({
      provider: 'local',
      model: 'nuextract',
      baseUrl: 'http://x/v1',
      apiKey: 'user-key',
      multimodal: true,
    });
    expect(getDecryptedUserSetting).toHaveBeenCalledWith(7, 'llm_api_key');
  });

  it('returns null when neither instance nor user config is usable', () => {
    getUserSettings.mockReturnValue({ llm_provider: 'openai' }); // no model
    const resolved = resolveLlmConfig(1);
    expect(resolved).toBeNull();
  });
});
@@ -0,0 +1,60 @@
import { describe, it, expect, vi, beforeEach } from 'vitest';
import { HttpException } from '@nestjs/common';
import { LlmLocalService } from '../../../../src/nest/llm-parse/llm-local.service';
/** Returns a new LlmLocalService on every call. */
function svc(): LlmLocalService {
  return new LlmLocalService();
}
/**
 * Replaces the global `fetch` with a vitest mock driven by `impl`.
 *
 * @param impl - Implementation invoked for every fetch call.
 * @returns The installed mock, so tests can inspect `mock.calls`.
 */
function mockFetch(impl: any) {
  const fetchStub = vi.fn(impl);
  vi.stubGlobal('fetch', fetchStub);
  return fetchStub;
}
beforeEach(() => vi.unstubAllGlobals());

// ollamaRoot(): normalizes a configured base URL down to the server root.
describe('LlmLocalService.ollamaRoot', () => {
  it('strips a trailing /v1 and slashes', () => {
    const cases: Array<[string, string]> = [
      ['http://localhost:11434/v1', 'http://localhost:11434'],
      ['http://localhost:11434/v1/', 'http://localhost:11434'],
      ['http://host:1/', 'http://host:1'],
    ];
    for (const [baseUrl, expectedRoot] of cases) {
      expect(svc().ollamaRoot(baseUrl)).toBe(expectedRoot);
    }
  });

  it('defaults when no base URL is given', () => {
    const root = svc().ollamaRoot(undefined);
    expect(root).toBe('http://localhost:11434');
  });

  it('rejects non-http(s) and invalid URLs', () => {
    for (const badUrl of ['ftp://x', 'not a url']) {
      expect(() => svc().ollamaRoot(badUrl)).toThrow(HttpException);
    }
  });
});
// listModels(): reads the model list from the server's /api/tags endpoint.
describe('LlmLocalService.listModels', () => {
  it('returns named models from /api/tags', async () => {
    // The second entry has an empty name and is expected to be left out of the result.
    const fetchFn = mockFetch(async () => ({
      ok: true,
      json: async () => ({ models: [{ name: 'nuextract', size: 100 }, { name: '' }] }),
    }));

    const { models } = await svc().listModels('http://localhost:11434/v1');

    expect(models).toEqual([{ name: 'nuextract', size: 100 }]);
    const requestedUrl = fetchFn.mock.calls[0][0];
    expect(requestedUrl).toBe('http://localhost:11434/api/tags');
  });

  // NOTE(review): the title promises a 502, but only the exception type is asserted.
  it('502s when the server is unreachable', async () => {
    mockFetch(async () => {
      throw new Error('ECONNREFUSED');
    });
    const pending = svc().listModels('http://localhost:11434');
    await expect(pending).rejects.toThrow(HttpException);
  });
});
// pull(): validates the model name and forwards the request to /api/pull.
describe('LlmLocalService.pull', () => {
  it('requires a model', async () => {
    const pending = svc().pull('http://localhost:11434', '');
    await expect(pending).rejects.toThrow(HttpException);
  });

  it('posts to /api/pull and returns the stream body', async () => {
    // Identity matters here: the service must hand back this exact object.
    const body = {} as ReadableStream<Uint8Array>;
    const fetchFn = mockFetch(async () => ({ ok: true, body }));

    const returned = await svc().pull('http://localhost:11434/v1', 'nuextract');
    expect(returned).toBe(body);

    const [requestedUrl, init] = fetchFn.mock.calls[0];
    expect(requestedUrl).toBe('http://localhost:11434/api/pull');
    expect(JSON.parse(init.body)).toEqual({ model: 'nuextract', stream: true });
  });
});
@@ -0,0 +1,116 @@
import { describe, it, expect, vi, beforeEach } from 'vitest';
// All mocks are created through vi.hoisted so the vi.mock factories below can
// reference them.
// Config resolver: its return value is set per test.
const { resolveLlmConfig } = vi.hoisted(() => ({ resolveLlmConfig: vi.fn() }));
vi.mock('../../../../src/nest/llm-parse/llm-config.resolver', () => ({ resolveLlmConfig }));
// Client factory: every created client shares the single `extract` mock, so
// tests can both program its result and inspect the input it received.
const { createLlmClient, extract } = vi.hoisted(() => {
const extract = vi.fn();
return { createLlmClient: vi.fn(() => ({ extract })), extract };
});
vi.mock('../../../../src/nest/llm-parse/llm-client.factory', () => ({ createLlmClient }));
// Text extraction: only `extractText` is replaced; the module's other exports
// are the real ones, spread in from the original module.
const { extractText } = vi.hoisted(() => ({ extractText: vi.fn(async () => 'Flight AB123') }));
vi.mock('../../../../src/nest/llm-parse/text-extract', async (orig) => {
const actual = await orig() as Record<string, unknown>;
return { ...actual, extractText };
});
import { LlmParseService } from '../../../../src/nest/llm-parse/llm-parse.service';
/**
 * Builds a resolved LLM config for the resolver mock.
 *
 * @param over - Fields that replace or extend the defaults
 *   (`provider: 'openai'`, `model: 'm'`, `multimodal: false`).
 * @returns The defaults merged with `over`.
 */
function cfg(over: Record<string, unknown> = {}) {
  const defaults = { provider: 'openai', model: 'm', multimodal: false };
  return { ...defaults, ...over };
}
/** Returns a new LlmParseService on every call. */
function svc(): LlmParseService {
  return new LlmParseService();
}
/**
 * Builds the upload object handed to parse().
 *
 * @param name - Original file name, stored under `originalName`.
 * @param body - File content, encoded into `buffer` (defaults to 'Flight AB123').
 */
function file(name: string, body = 'Flight AB123') {
  const buffer = Buffer.from(body);
  return { buffer, originalName: name };
}
// Baseline for every test: a non-multimodal 'openai' config resolves, the
// client returns one FlightReservation, and text extraction yields 'Flight AB123'.
beforeEach(() => {
vi.clearAllMocks();
resolveLlmConfig.mockReturnValue(cfg());
extract.mockResolvedValue([{ '@type': 'FlightReservation' }]);
extractText.mockResolvedValue('Flight AB123');
});
// Covers availability, what input is sent to the client per provider and file
// type, normalization of the client's output, and failure handling.
describe('LlmParseService', () => {
it('isAvailable reflects whether a config resolves', () => {
// The null applies to the first call only; the second call sees the baseline config.
resolveLlmConfig.mockReturnValueOnce(null);
expect(svc().isAvailable(1)).toBe(false);
expect(svc().isAvailable(1)).toBe(true);
});
it('returns a not-configured warning when no config resolves', async () => {
resolveLlmConfig.mockReturnValue(null);
const res = await svc().parse(file('a.txt'), 1);
expect(res.kiItems).toEqual([]);
expect(res.warnings[0]).toMatch(/not configured/i);
expect(extract).not.toHaveBeenCalled();
});
it('sends extracted text for a text-like file', async () => {
const res = await svc().parse(file('a.txt'), 1);
expect(res.kiItems).toEqual([{ '@type': 'FlightReservation' }]);
// First argument of the first client call is the extraction input.
const input = extract.mock.calls[0][0];
expect(input.text).toBe('Flight AB123');
expect(input.file).toBeUndefined();
});
it('extracts text for a pdf on the OpenAI-compatible/local path (no native bytes)', async () => {
// Baseline provider is 'openai', so the pdf goes through the mocked extractText.
extractText.mockResolvedValue('Hotel X');
await svc().parse(file('a.pdf', '%PDF'), 1);
const input = extract.mock.calls[0][0];
expect(input.text).toBe('Hotel X');
expect(input.file).toBeUndefined();
});
it('sends a pdf as native bytes only for Anthropic', async () => {
resolveLlmConfig.mockReturnValue(cfg({ provider: 'anthropic' }));
await svc().parse(file('a.pdf', '%PDF'), 1);
const input = extract.mock.calls[0][0];
expect(input.file).toEqual({ mimeType: 'application/pdf', data: expect.any(Buffer) });
expect(input.text).toBeUndefined();
expect(extractText).not.toHaveBeenCalled();
});
it('warns when a pdf yields no readable text (e.g. a scan)', async () => {
// Whitespace-only extraction result stands in for a pdf without a text layer.
extractText.mockResolvedValue(' ');
const res = await svc().parse(file('a.pdf', '%PDF'), 1);
expect(res.kiItems).toEqual([]);
expect(res.warnings[0]).toMatch(/no readable text/i);
expect(extract).not.toHaveBeenCalled();
});
it('folds flattened type fields into reservationFor (small-model output)', async () => {
// The client returns flight details at the root of the node, with no reservationFor.
extract.mockResolvedValue([{
'@type': 'FlightReservation',
reservationNumber: 'ABC',
flightNumber: 'EZY1357',
airline: { iataCode: 'EG' },
departureAirport: { iataCode: 'GEG' },
arrivalAirport: { iataCode: 'AMS' },
departureTime: '2026-06-11T10:00:00',
}]);
const res = await svc().parse(file('a.txt'), 1);
const item = res.kiItems[0] as any;
expect(item.reservationNumber).toBe('ABC');
expect(item.reservationFor).toMatchObject({ flightNumber: 'EZY1357', departureAirport: { iataCode: 'GEG' } });
// root-level keys are not duplicated into reservationFor
expect(item.reservationFor.reservationNumber).toBeUndefined();
});
it('leaves already-nested reservationFor untouched', async () => {
extract.mockResolvedValue([{ '@type': 'FlightReservation', reservationFor: { flightNumber: 'X1' } }]);
const res = await svc().parse(file('a.txt'), 1);
expect((res.kiItems[0] as any).reservationFor).toEqual({ flightNumber: 'X1' });
});
it('drops nodes without a string @type and warns', async () => {
// Second node has no '@type' at all.
extract.mockResolvedValue([{ '@type': 'FlightReservation' }, { foo: 'bar' }]);
const res = await svc().parse(file('a.txt'), 1);
expect(res.kiItems).toEqual([{ '@type': 'FlightReservation' }]);
expect(res.warnings.some(w => /unrecognized/i.test(w))).toBe(true);
});
it('degrades to a warning when the client throws', async () => {
// parse() is expected to resolve (not reject) even though the client rejects.
extract.mockRejectedValue(new Error('boom'));
const res = await svc().parse(file('a.txt'), 1);
expect(res.kiItems).toEqual([]);
expect(res.warnings[0]).toMatch(/AI parsing failed/i);
});
});
@@ -0,0 +1,26 @@
import { describe, it, expect } from 'vitest';
import { buildSystemPrompt, KI_RESERVATION_JSON_SCHEMA } from '../../../../src/nest/llm-parse/llm-prompt';
import { KI_RESERVATION_TYPES } from '@trek/shared';
// Guards the contract between the system prompt / JSON schema and the shared
// list of reservation types.
describe('llm-prompt', () => {
  it('names every recognized @type the mapper supports', () => {
    const systemPrompt = buildSystemPrompt();
    for (const reservationType of KI_RESERVATION_TYPES) {
      expect(systemPrompt).toContain(reservationType);
    }
  });

  it('instructs JSON-only output wrapped in reservations', () => {
    const systemPrompt = buildSystemPrompt();
    expect(systemPrompt).toMatch(/"reservations"/);
    // Case-insensitive check for the date-format instruction.
    const lowered = systemPrompt.toLowerCase();
    expect(lowered).toContain('iso 8601');
  });

  it('exposes a strict-safe object-root JSON schema enumerating the types', () => {
    const schema = KI_RESERVATION_JSON_SCHEMA as any;

    // Root: a closed object that requires `reservations`.
    expect(schema.type).toBe('object');
    expect(schema.additionalProperties).toBe(false);
    expect(schema.required).toContain('reservations');

    // Items: `@type` is required and limited to the shared type list.
    const itemSchema = schema.properties.reservations.items;
    const allowedTypes = itemSchema.properties['@type'].enum;
    expect(allowedTypes).toEqual([...KI_RESERVATION_TYPES]);
    expect(itemSchema.required).toContain('@type');
  });
});
@@ -0,0 +1,40 @@
import { describe, it, expect, vi } from 'vitest';
// `getText` is created through vi.hoisted so the pdf-parse mock factory below
// can reference it; it resolves to a fixed text payload.
const { getText } = vi.hoisted(() => ({ getText: vi.fn(async () => ({ text: 'Hotel X — confirmation ABC' })) }));
// Stand-in for pdf-parse's PDFParse class: every instance shares the `getText`
// mock above and gets its own no-op async `destroy`.
vi.mock('pdf-parse', () => ({
PDFParse: class {
getText = getText;
destroy = vi.fn(async () => {});
},
}));
import { isTextLike, isPdf, extractText } from '../../../../src/nest/llm-parse/text-extract';
// File-type classification plus text extraction for plain text, html and pdf.
describe('text-extract', () => {
  it('classifies text-like and pdf extensions', () => {
    const textLikeCases: Array<[string, boolean]> = [
      ['a.txt', true],
      ['a.html', true],
      ['a.eml', true],
      ['a.pdf', false],
    ];
    for (const [name, expected] of textLikeCases) {
      expect(isTextLike(name)).toBe(expected);
    }

    // The upper-case extension must still count as a pdf.
    const pdfCases: Array<[string, boolean]> = [
      ['a.PDF', true],
      ['a.txt', false],
    ];
    for (const [name, expected] of pdfCases) {
      expect(isPdf(name)).toBe(expected);
    }
  });

  it('decodes plain text', async () => {
    const decoded = await extractText(Buffer.from('hello world'), 'a.txt');
    expect(decoded).toBe('hello world');
  });

  it('strips markup from html/eml', async () => {
    const html = '<html><style>x{}</style><body><p>Flight AB123</p><script>1</script></body></html>';
    const stripped = await extractText(Buffer.from(html), 'a.html');
    expect(stripped).toContain('Flight AB123');
    // Neither tags nor the stylesheet body may survive.
    expect(stripped).not.toContain('<p>');
    expect(stripped).not.toContain('x{}');
  });

  it('extracts the embedded text layer from a pdf', async () => {
    const pdfBytes = Buffer.from('%PDF-1.4');
    const extracted = await extractText(pdfBytes, 'a.pdf');
    expect(extracted).toBe('Hotel X — confirmation ABC');
    expect(getText).toHaveBeenCalled();
  });
});