Add OCR support for reading text from images

codinglabsong · codinglabsong · commit fbd09b622152 · 2025-07-28T13:55:00.000-07:00
diff --git a/.gitignore b/.gitignore
@@ -18,6 +18,7 @@ wandb/
 
 # datasets
 data/
+old-data/
 
 # outputs
 outputs/
diff --git a/README.md b/README.md
@@ -0,0 +1,6 @@
+# available file formats
+Text files and images (text in images is extracted through OCR)
+
+# System packages needed for image OCR
+sudo apt update
+sudo apt install -y tesseract-ocr libtesseract-dev
diff --git a/requirements.txt b/requirements.txt
@@ -7,4 +7,4 @@ pypdf
 python-dotenv
 pinecone
 langgraph
-unstructured[pdf,docx,pptx,md]
+unstructured[pdf,docx,pptx,md,image]
diff --git a/src/any_chatbot/agent.py b/src/any_chatbot/agent.py
@@ -46,10 +46,9 @@
 config = {"configurable": {"thread_id": random.random()}}
 
 input_message = (
-    "First retrieve what the revenue for Nike in 2023 was using the functional call.\n\n"
-    "Once you get the answer, do a second retrieve to tell me which distribution centers nike have.\n\n"
-    "Once you get the second answer,, tell me how many employees nike has. You can retreive MULTIPLE TIMES\n\n"
-    "Base your answers only on the retrieved information thorugh the functional call you have."
+    "What is the content of the image?\n\n"
+    "When you don't know while files the user is talking about, use the functional call to retrieve what data is available with a general prompt.\n\n"
+    "Base your answers only on the retrieved information thorugh the functional call you have. You can retreive MULTIPLE TIMES"
 )
 
 for event in agent_executor.stream(
diff --git a/src/any_chatbot/indexing.py b/src/any_chatbot/indexing.py
@@ -27,12 +27,16 @@ def index_text_docs(
             "**/*.md",
             "**/*.html",
             "**/*.txt",
+            "**/*.png",
+            "**/*.jpg",
+            "**/*.jpeg",
+            "**/*.tiff",
         ],
         loader_cls=UnstructuredFileLoader
     )
-    print(f"Loading docs from {data_pth}")
+    print(f"Loading files from {data_pth}")
     docs = loader.load()
-    print(f"Loaded {len(docs)} docs")
+    print(f"Loaded {len(docs)} files")
 
     # Split the texts
     text_splitter = RecursiveCharacterTextSplitter(