Improve email import for Gmail

umeshma · umeshma · commit c26a40d7c14f · 2025-10-09T15:41:15.000-07:00
diff --git a/tools/test_email.py b/tools/test_email.py
@@ -10,6 +10,7 @@
 from colorama import Fore
 from pathlib import Path
 import argparse
+import shelve
 
 try:
     import readline  # noqa: F401
@@ -36,11 +37,13 @@ def __init__(
         self, base_path: Path, db_name: str, conversation: EmailMemory
     ) -> None:
         self.base_path = base_path
+        self.db_name = db_name
         self.db_path = base_path.joinpath(db_name)
         self.conversation = conversation
         self.query_translator: (
             typechat.TypeChatJsonTranslator[search_query_schema.SearchQuery] | None
         ) = None
+        self.index_log = load_index_log(str(self.db_path), create_new=False)
 
     def get_translator(self):
         if self.query_translator is None:
@@ -52,17 +55,23 @@ def get_translator(self):
 
     async def load_conversation(self, db_name: str, create_new: bool = False):
         await self.conversation.settings.storage_provider.close()
+        self.db_name = db_name
         self.db_path = self.base_path.joinpath(db_name)
         self.conversation = await load_or_create_email_index(
             str(self.db_path), create_new
         )
+        self.index_log = load_index_log(str(self.db_path), create_new)
 
     # Delete the current conversation and re-create it
     async def restart_conversation(self):
-        await self.conversation.settings.storage_provider.close()
-        self.conversation = await load_or_create_email_index(
-            str(self.db_path), create_new=True
-        )
+        await self.load_conversation(self.db_name, create_new=True)
+
+    def is_indexed(self, email_id: str | None) -> bool:
+        return bool(email_id and self.index_log.get(email_id))
+
+    def log_indexed(self, email_id: str | None) -> None:
+        if email_id is not None:
+            self.index_log[email_id] = True
 
 
 CommandHandler = Callable[[EmailContext, list[str]], Awaitable[None]]
@@ -77,8 +86,6 @@ def decorator(func: Callable):
     return decorator
 
 
-# Just simple test code
-# TODO : Once stable, move creation etc to query.py
 async def main():
 
     if sys.argv[1:2]:
@@ -168,6 +175,10 @@ def _add_messages_def() -> argparse.ArgumentParser:
         default="",
         help="Path to an .eml file or to a directory with .eml files",
     )
+    cmd.add_argument("--ignore_error", type=bool, default=True, help="Ignore errors")
+    cmd.add_argument(
+        "--knowledge", type=bool, default=True, help="Automatically extract knowledge"
+    )
     return cmd
 
 
@@ -180,38 +191,47 @@ async def add_messages(context: EmailContext, args: list[str]):
 
     # Get the path to the email file or directory of emails to ingest
     src_path = Path(named_args.path)
-    emails: list[EmailMessage]
+    emails: Iterable[EmailMessage]
     if src_path.is_file():
         emails = [import_email_from_file(str(src_path))]
     else:
         emails = import_emails_from_dir(str(src_path))
 
-    print(Fore.CYAN, f"Importing {len(emails)} emails".capitalize())
-    print(Fore.RESET)
+    print(Fore.CYAN, f"Importing from {src_path}" + Fore.RESET)
 
-    conversation = context.conversation
-    for email in emails:
-        # print_email(email)
-        # print()
-        # knowledge = email.metadata.get_knowledge()
-        # print_knowledge(knowledge)
+    semantic_settings = context.conversation.settings.semantic_ref_index_settings
+    auto_knowledge = semantic_settings.auto_extract_knowledge
+    try:
+        conversation = context.conversation
+        # Add one at a time for debugging etc.
+        for i, email in enumerate(emails):
+            email_id = email.metadata.id
+            email_src = email.src_url if email.src_url is not None else ""
+            print_progress(i + 1, None, email.src_url)
+            print()
+            if context.is_indexed(email_id):
+                print(Fore.GREEN + email_src + "[Already indexed]" + Fore.RESET)
+                continue
 
-        print(f"From: {email.metadata.sender}\nTo:{email.metadata.recipients}")
-        # await conversation.add_message(email)
-        await conversation.add_messages_with_indexing([email])
-        print("Success")
+            try:
+                await conversation.add_messages_with_indexing([email])
+                context.log_indexed(email_id)
+            except Exception as e:
+                if named_args.ignore_error:
+                    print_error(f"{email.src_url}\n{e}")
+                    print(
+                        Fore.GREEN
+                        + f"ignore_error = {named_args.ignore_error}"
+                        + Fore.RESET
+                    )
+                else:
+                    raise
+    finally:
+        semantic_settings.auto_extract_knowledge = auto_knowledge
 
     await print_conversation_stats(conversation)
 
 
-# async def build_index(context: EmailContext, args: list[str]):
-#    conversation = context.conversation
-#    print(Fore.GREEN, "Building index")
-#    await print_conversation_stats(conversation)
-#    await conversation.build_index()
-#    print(Fore.GREEN + "Built index.")
-
-
 async def search_index(context: EmailContext, args: list[str]):
     if len(args) == 0:
         return
@@ -345,6 +365,14 @@ async def load_or_create_email_index(db_path: str, create_new: bool) -> EmailMem
     return email_memory
 
 
+def load_index_log(db_path: str, create_new: bool) -> shelve.Shelf[Any]:
+    log_path = db_path + ".index_log"
+    index_log = shelve.open(log_path)
+    if create_new:
+        index_log.clear()
+    return index_log
+
+
 def delete_sqlite_db(db_path: str):
     if os.path.exists(db_path):
         os.remove(db_path)  # Delete existing database for clean test
@@ -404,29 +432,6 @@ def print_knowledge(knowledge: kplib.KnowledgeResponse):
     print(Fore.RESET)
 
 
-def print_list(
-    color, list: Iterable[Any], title: str, type: Literal["plain", "ol", "ul"] = "plain"
-):
-    print(color)
-    if title:
-        print(f"# {title}\n")
-    if type == "plain":
-        for item in list:
-            print(item)
-    elif type == "ul":
-        for item in list:
-            print(f"- {item}")
-    elif type == "ol":
-        for i, item in enumerate(list):
-            print(f"{i + 1}. {item}")
-    print(Fore.RESET)
-
-
-def print_error(msg: str):
-    print(Fore.RED + msg)
-    print(Fore.RESET)
-
-
 async def print_conversation_stats(conversation: IConversation):
     print(f"Conversation index stats".upper())
     print(f"Message count: {await conversation.messages.size()}")
@@ -453,6 +458,37 @@ async def print_search_results(
     print(Fore.RESET)
 
 
+def print_list(
+    color, list: Iterable[Any], title: str, type: Literal["plain", "ol", "ul"] = "plain"
+):
+    print(color)
+    if title:
+        print(f"# {title}\n")
+    if type == "plain":
+        for item in list:
+            print(item)
+    elif type == "ul":
+        for item in list:
+            print(f"- {item}")
+    elif type == "ol":
+        for i, item in enumerate(list):
+            print(f"{i + 1}. {item}")
+    print(Fore.RESET)
+
+
+def print_error(msg: str):
+    print(Fore.RED + msg + Fore.RESET)
+
+
+def print_progress(cur: int, total: int | None = None, suffix: str | None = "") -> None:
+    if suffix is None:
+        suffix = ""
+    if total is not None:
+        print(f"[{cur} / {total}] {suffix}\r", end="", flush=True)
+    else:
+        print(f"[{cur}] {suffix}\r", end="", flush=True)
+
+
 if __name__ == "__main__":
     try:
         asyncio.run(main())
diff --git a/typeagent/emails/email_import.py b/typeagent/emails/email_import.py
@@ -14,14 +14,10 @@
 
 def import_emails_from_dir(
     dir_path: str, max_chunk_length: int = 4096
-) -> list[EmailMessage]:
-    messages: list[EmailMessage] = []
+) -> Iterable[EmailMessage]:
     for file_path in Path(dir_path).iterdir():
         if file_path.is_file():
-            messages.append(
-                import_email_from_file(str(file_path.resolve()), max_chunk_length)
-            )
-    return messages
+            yield import_email_from_file(str(file_path.resolve()), max_chunk_length)
 
 
 # Imports an email file (.eml) as a list of EmailMessage objects
@@ -32,7 +28,9 @@ def import_email_from_file(
     with open(file_path, "r") as f:
         email_string = f.read()
 
-    return import_email_string(email_string, max_chunk_length)
+    email = import_email_string(email_string, max_chunk_length)
+    email.src_url = file_path
+    return email
 
 
 # Imports a single email MIME string and returns an EmailMessage object
@@ -65,6 +63,7 @@ def import_email_message(msg: Message, max_chunk_length: int) -> EmailMessage:
         cc=_import_address_headers(msg.get_all("Cc", [])),
         bcc=_import_address_headers(msg.get_all("Bcc", [])),
         subject=msg.get("Subject"),
+        id=msg.get("Message-ID", None),
     )
     timestamp: str | None = None
     timestamp_date = msg.get("Date", None)
diff --git a/typeagent/emails/email_message.py b/typeagent/emails/email_message.py
@@ -28,6 +28,7 @@ class EmailMessageMeta(IKnowledgeSource, IMessageMetadata):
     cc: list[str] = Field(default_factory=list)
     bcc: list[str] = Field(default_factory=list)
     subject: str | None = None
+    id: str | None = None
 
     @property
     def source(self) -> str | None:  # type: ignore[reportIncompatibleVariableOverride]
@@ -155,6 +156,7 @@ def __init__(self, **data: Any) -> None:
         "Tags associated with the message", default_factory=list
     )
     timestamp: str | None = None  # Use metadata.sent_on for the actual sent time
+    src_url: str | None = None  # Source file or uri for this email
 
     def get_knowledge(self) -> kplib.KnowledgeResponse:
         return self.metadata.get_knowledge()