diff --git a/libs/langchain-community/package.json b/libs/langchain-community/package.json index 3aca42016acd..ffdbf9925909 100644 --- a/libs/langchain-community/package.json +++ b/libs/langchain-community/package.json @@ -3406,6 +3406,17 @@ "default": "./dist/experimental/llms/chrome_ai.cjs" } }, + "./chat_loaders/whatsapp": { + "input": "./src/chat_loaders/whatsapp.ts", + "import": { + "types": "./dist/chat_loaders/whatsapp.d.ts", + "default": "./dist/chat_loaders/whatsapp.js" + }, + "require": { + "types": "./dist/chat_loaders/whatsapp.d.cts", + "default": "./dist/chat_loaders/whatsapp.cjs" + } + }, "./experimental/tools/pyinterpreter": { "input": "./src/experimental/tools/pyinterpreter.ts", "import": { diff --git a/libs/langchain-community/src/chat_loaders/tests/example_data/whatsapp/whatsapp_chat.txt b/libs/langchain-community/src/chat_loaders/tests/example_data/whatsapp/whatsapp_chat.txt new file mode 100644 index 000000000000..66e66c1ca2f4 --- /dev/null +++ b/libs/langchain-community/src/chat_loaders/tests/example_data/whatsapp/whatsapp_chat.txt @@ -0,0 +1,13 @@ +[8/15/23, 9:12:33 AM] Dr. Feather: ‎Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them. +[8/15/23, 9:12:43 AM] Dr. Feather: I spotted a rare Hyacinth Macaw yesterday in the Amazon Rainforest. Such a magnificent creature! +‎[8/15/23, 9:12:48 AM] Dr. Feather: ‎image omitted +[8/15/23, 9:13:15 AM] Jungle Jane: That's stunning! Were you able to observe its behavior? +‎[8/15/23, 9:13:23 AM] Dr. Feather: ‎image omitted +[8/15/23, 9:14:02 AM] Dr. Feather: Yes, it seemed quite social with other macaws. They're known for their playful nature. +[8/15/23, 9:14:15 AM] Jungle Jane: How's the research going on parrot communication? +‎[8/15/23, 9:14:30 AM] Dr. Feather: ‎image omitted +[8/15/23, 9:14:50 AM] Dr. Feather: It's progressing well. We're learning so much about how they use sound and color to communicate. 
+[8/15/23, 9:15:10 AM] Jungle Jane: That's fascinating! Can't wait to read your paper on it. +[8/15/23, 9:15:20 AM] Dr. Feather: Thank you! I'll send you a draft soon. +[8/15/23, 9:25:16 PM] Jungle Jane: Looking forward to it! Keep up the great work. + diff --git a/libs/langchain-community/src/chat_loaders/tests/whatsapp.test.ts b/libs/langchain-community/src/chat_loaders/tests/whatsapp.test.ts new file mode 100644 index 000000000000..0ff7811994b8 --- /dev/null +++ b/libs/langchain-community/src/chat_loaders/tests/whatsapp.test.ts @@ -0,0 +1,22 @@ +import * as url from "node:url"; +import * as path from "node:path"; +import { test, expect } from "@jest/globals"; +import { WhatsAppChatLoader } from "../whatsapp.js"; + +test("WhatsAppChatLoader parses WhatsApp export preserving timestamps and senders", async () => { + const filePath = path.resolve( + path.dirname(url.fileURLToPath(import.meta.url)), + "./example_data/whatsapp/whatsapp_chat.txt" + ); + + const loader = new WhatsAppChatLoader(filePath); + const sessions = await loader.load(); + expect(sessions.length).toBe(1); + const { messages } = sessions[0]; + expect(messages.length).toBeGreaterThan(0); + // Mirror Python assertion: content contains the macaw sentence + const first = messages[0]; + expect(first.text).toContain( + "I spotted a rare Hyacinth Macaw yesterday in the Amazon Rainforest. Such a magnificent creature!" 
/**
 * One parsed conversation: the ordered messages recovered from a single
 * WhatsApp export.
 */
export type ChatSession = {
  messages: BaseMessage[];
  // Reserved for future parity with Python ChatSession
  // functions?: Array<Record<string, unknown>>;
};

/**
 * Placeholder texts WhatsApp writes into an export instead of real content.
 * A message whose text starts with one of these (optionally preceded by
 * left-to-right marks, U+200E) is dropped.
 */
const WHATSAPP_IGNORED_MESSAGES: readonly string[] = [
  "This message was deleted",
  "<Media omitted>",
  "image omitted",
  "Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them.",
];

/** Escapes every regex metacharacter in `s` so it matches literally. */
const escapeWhatsAppRegExp = (s: string): string =>
  s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

/**
 * Matches ignorable placeholder messages. Anchored at the start of the text
 * so a real message that merely mentions e.g. "image omitted" is kept.
 */
const WHATSAPP_IGNORE_RE = new RegExp(
  `^\\u200E*(?:${WHATSAPP_IGNORED_MESSAGES.map(escapeWhatsAppRegExp).join(
    "|"
  )})`,
  "i"
);

/**
 * Matches the first line of a message:
 * `[M/D/YY, H:MM:SS AM] Sender: text`, with optional brackets and optional
 * left-to-right marks. Captures timestamp, sender and text.
 */
const WHATSAPP_MESSAGE_LINE_RE =
  /^\u200E*\[?(\d{1,2}\/\d{1,2}\/\d{2,4}, \d{1,2}:\d{2}:\d{2} (?:AM|PM))\]?[ \u200E]*([^:]+): (.+)$/i;

/**
 * Loader for WhatsApp chat export .txt files (without media).
 *
 * Parity with Python implementation:
 * - Single regex with optional brackets and leading LRM
 * - Slash-separated dates, 12-hour times with seconds and AM/PM
 * - Multi-line messages joined with a single space
 * - Exact ignore list and LRM-tolerant matching
 */
export class WhatsAppChatLoader {
  /**
   * @param filePathOrBlob Path of the export file (read through
   *   `node:fs/promises`) or a `Blob` holding its text.
   */
  constructor(public filePathOrBlob: string | Blob) {}

  /**
   * Eagerly collects everything yielded by {@link lazyLoad}.
   *
   * @returns All chat sessions found in the export.
   * @throws Error if the file or blob cannot be read.
   */
  async load(): Promise<ChatSession[]> {
    const sessions: ChatSession[] = [];
    for await (const session of this.lazyLoad()) {
      sessions.push(session);
    }
    return sessions;
  }

  /**
   * Reads the export and yields exactly one session containing every parsed
   * message.
   *
   * @throws Error if the file or blob cannot be read.
   */
  async *lazyLoad(): AsyncGenerator<ChatSession> {
    const text = await this.readAllText();
    yield this.parse(text);
  }

  /**
   * Converts the raw export text into a chat session.
   *
   * Lines that do not start a new message are treated as continuations of
   * the previous one and appended to it with a single space. Placeholder
   * messages (deleted, media omitted, encryption notice) are skipped.
   *
   * @param raw Full text of the export; both LF and CRLF line endings are
   *   accepted.
   * @returns The session; each message carries the sender as `name` and in
   *   `additional_kwargs.sender`, and the original timestamp string in
   *   `additional_kwargs.events[0].message_time`.
   */
  protected parse(raw: string): ChatSession {
    const messages: BaseMessage[] = [];

    // Group physical lines into logical messages. Splitting on /\r?\n/
    // matters: `.` in a JS regex never matches "\r", so a stray carriage
    // return would stop every line from being recognised as a message start.
    const chatLines: string[] = [];
    let currentMessage = "";
    for (const line of raw.split(/\r?\n/)) {
      if (WHATSAPP_MESSAGE_LINE_RE.test(line)) {
        if (currentMessage) chatLines.push(currentMessage);
        currentMessage = line;
      } else {
        currentMessage += ` ${line.trim()}`;
      }
    }
    if (currentMessage) chatLines.push(currentMessage);

    for (const line of chatLines) {
      const result = WHATSAPP_MESSAGE_LINE_RE.exec(line.trim());
      if (!result) {
        // Mirror Python's debug logging for unparsable lines
        console.debug(`Could not parse line: ${line}`);
        continue;
      }

      const [, timestamp, sender, text] = result;
      if (WHATSAPP_IGNORE_RE.test(text.trim())) continue;

      messages.push(
        new HumanMessage({
          content: text,
          name: sender,
          additional_kwargs: {
            sender,
            events: [{ message_time: timestamp }],
          },
        })
      );
    }

    return { messages };
  }

  /**
   * Returns the whole export as a string, from disk when constructed with a
   * path and from the blob otherwise.
   *
   * @throws Error "Failed to read file" / "Failed to read blob" when the
   *   underlying read rejects; the original error is logged first.
   */
  protected async readAllText(): Promise<string> {
    if (typeof this.filePathOrBlob === "string") {
      const { readFile } = await WhatsAppChatLoader.imports();
      try {
        return await readFile(this.filePathOrBlob, "utf8");
      } catch (e) {
        console.error(e);
        throw new Error("Failed to read file");
      }
    } else {
      try {
        return await this.filePathOrBlob.text();
      } catch (e) {
        console.error(e);
        throw new Error("Failed to read blob");
      }
    }
  }

  /**
   * Lazily imports `node:fs/promises` so the module can still be loaded
   * where that import is unavailable (blob input needs no file system).
   *
   * @throws Error naming the detected environment when the import fails.
   */
  static async imports(): Promise<{
    readFile: typeof ReadFileT;
  }> {
    try {
      const { readFile } = await import("node:fs/promises");
      return { readFile };
    } catch (e) {
      console.error(e);
      throw new Error(
        `Failed to load fs/promises. WhatsAppChatLoader available only on environment 'node'. It appears you are running environment '${getEnv()}'.`
      );
    }
  }
}