support google cloud storage

CodingWithTim · CodingWithTim · commit f7e92e141b9f · 2025-01-04T20:55:39.000Z
diff --git a/fastchat/constants.py b/fastchat/constants.py
@@ -25,6 +25,9 @@
 IMAGE_MODERATION_MSG = (
     "$MODERATION$ YOUR IMAGE VIOLATES OUR CONTENT MODERATION GUIDELINES."
 )
+PDF_MODERATION_MSG = (
+    "$MODERATION$ YOUR PDF VIOLATES OUR CONTENT MODERATION GUIDELINES."
+)
 MODERATION_MSG = "$MODERATION$ YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES."
 CONVERSATION_LIMIT_MSG = "YOU HAVE REACHED THE CONVERSATION LENGTH LIMIT. PLEASE CLEAR HISTORY AND START A NEW CONVERSATION."
 INACTIVE_MSG = "THIS SESSION HAS BEEN INACTIVE FOR TOO LONG. PLEASE REFRESH THIS PAGE."
diff --git a/fastchat/serve/gradio_block_arena_vision.py b/fastchat/serve/gradio_block_arena_vision.py
@@ -260,6 +260,44 @@ def wrap_pdfchat_query(query, document):
 }
 
 
+# TODO: P1: Integrate this.
+def pdf_moderator(images):
+    """Return True if OpenAI moderation flags any of the given page images.
+
+    Each image must provide `save(buffer, format="JPEG")`. A page whose
+    moderation request raises is printed and skipped, so it counts as not
+    flagged; an empty `images` yields False.
+    """
+    import base64
+    from openai import OpenAI
+    from io import BytesIO
+
+    client = OpenAI()
+    for image in images:
+        buffer = BytesIO()
+        image.save(buffer, format="JPEG")
+        image_b64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
+
+        # OpenAI's maximum number of images is 1 at the moment, so every page
+        # is sent in its own request, wrapped in the list form `input` expects.
+        page_input = {
+            "type": "image_url",
+            "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"},
+        }
+        try:
+            response = client.moderations.create(
+                model="omni-moderation-latest",
+                input=[page_input],
+            )
+            # Results sit on `response.results`, one entry per input item.
+            if response.results[0].flagged:
+                return True
+        except Exception as e:
+            print(e)
+
+    return False
+
+
 def detect_language_from_doc(pdf_file_path):
     from pdf2image import convert_from_path
     from polyglot.detect import Detector
@@ -272,6 +310,7 @@ def detect_language_from_doc(pdf_file_path):
 
     # Convert pdf into image (first page only for efficiency)
     images = convert_from_path(pdf_file_path)
+    
     extracted_text = pytesseract.image_to_string(
         images[0], lang=TESSERACT_SUPPORTED_LANGS
     )
@@ -291,8 +330,6 @@ def parse_pdf(file_path):
     doc_lang = detect_language_from_doc(file_path)
     doc_lang = LLAMAPARSE_SUPPORTED_LANGS[doc_lang[0]]
 
-    print(doc_lang)
-
     for _ in range(LLAMA_PARSE_MAX_RETRY):
         try:
             documents = LlamaParse(
diff --git a/fastchat/serve/gradio_block_arena_vision_anony.py b/fastchat/serve/gradio_block_arena_vision_anony.py
@@ -13,6 +13,7 @@
 from fastchat.constants import (
     TEXT_MODERATION_MSG,
     IMAGE_MODERATION_MSG,
+    PDF_MODERATION_MSG,
     MODERATION_MSG,
     CONVERSATION_LIMIT_MSG,
     SLOW_MODEL_MSG,
@@ -77,6 +78,8 @@
     build_logger,
     moderation_filter,
     image_moderation_filter,
+    upload_pdf_file_to_gcs,
+    hash_pdf,
 )
 
 logger = build_logger("gradio_web_server_multi", "gradio_web_server_multi.log")
@@ -297,10 +300,26 @@ def add_text(
                 PDFCHAT_SAMPLING_WEIGHTS,
                 PDFCHAT_SAMPLING_BOOST_MODELS,
             )
+            
+            # Save a unique id for mapping the conversation back to the file on Google Cloud.
+            unique_id = hash_pdf(pdfs[0])
+            
             states = [
-                State(model_left, is_vision=False),
-                State(model_right, is_vision=False),
+                State(
+                    model_left, 
+                    is_vision=False, 
+                    pdf_id=unique_id
+                ),
+                State(
+                    model_right, 
+                    is_vision=False, 
+                    pdf_id=unique_id
+                ),
             ]
+            upload_pdf_file_to_gcs(
+                pdf_file_path=pdfs[0],
+                filename=unique_id,
+            )
         else:
             model_left, model_right = get_battle_pair(
                 context.all_text_models,
@@ -309,7 +328,6 @@ def add_text(
                 SAMPLING_WEIGHTS,
                 SAMPLING_BOOST_MODELS,
             )
-
             states = [
                 State(model_left, is_vision=False),
                 State(model_right, is_vision=False),
@@ -333,12 +351,11 @@ def add_text(
 
     images = convert_images_to_conversation_format(images)
 
+    # TODO: add PDF moderator
     text, image_flagged, csam_flag = moderate_input(
         state0, text, text, model_list, images, ip
     )
 
-    # TODO: add PDF moderator
-
     conv = states[0].conv
     if (len(conv.messages) - conv.offset) // 2 >= CONVERSATION_TURN_LIMIT:
         logger.info(f"conversation turn limit. ip: {get_ip(request)}. text: {text}")
diff --git a/fastchat/serve/gradio_web_server.py b/fastchat/serve/gradio_web_server.py
@@ -112,13 +112,14 @@
 
 
 class State:
-    def __init__(self, model_name, is_vision=False):
+    def __init__(self, model_name, is_vision=False, pdf_id=None):
         self.conv = get_conversation_template(model_name)
         self.conv_id = uuid.uuid4().hex
         self.skip_next = False
         self.model_name = model_name
         self.oai_thread_id = None
         self.is_vision = is_vision
+        self.pdf_id = pdf_id # NOTE(Tim): Version 1 PDFChat Architecture, could be revised later.
 
         # NOTE(chris): This could be sort of a hack since it assumes the user only uploads one image. If they can upload multiple, we should store a list of image hashes.
         self.has_csam_image = False
@@ -151,6 +152,7 @@ def dict(self):
             {
                 "conv_id": self.conv_id,
                 "model_name": self.model_name,
+                "pdf_id": self.pdf_id,
             }
         )
 
diff --git a/fastchat/serve/setup_pdfchat.sh b/fastchat/serve/setup_pdfchat.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+# Run with `source` if TESSDATA_PREFIX should persist in the calling shell.
+
+# Install Python packages
+pip install llama-index-core llama-parse llama-index-readers-file python-dotenv
+pip install polyglot
+pip install PyICU
+pip install pycld2
+pip install pytesseract
+pip install pdf2image
+
+# Clone the Tesseract tessdata repository unless a previous run already did
+[ -d tessdata ] || git clone https://github.com/tesseract-ocr/tessdata
+
+# cd into tessdata and set TESSDATA_PREFIX to the current directory; stop if
+# the directory is missing so the variable never points at the wrong place
+cd tessdata || { echo "tessdata directory not found" >&2; return 1 2>/dev/null || exit 1; }
+export TESSDATA_PREFIX="$(pwd)"
+echo "TESSDATA_PREFIX is set to: $TESSDATA_PREFIX"
diff --git a/fastchat/utils.py b/fastchat/utils.py
@@ -430,6 +430,20 @@ def upload_image_file_to_gcs(image, filename):
     return blob.public_url
 
 
+def upload_pdf_file_to_gcs(pdf_file_path, filename):
+    """Upload a local PDF to the "arena-pdf-dev" bucket; return its public URL.
+
+    The object is stored under `filename` with an "application/pdf" content type.
+    """
+    from google.cloud import storage
+
+    pdf_blob = storage.Client().get_bucket("arena-pdf-dev").blob(filename)
+    # upload_from_filename reads the file at the given path itself.
+    pdf_blob.upload_from_filename(pdf_file_path, content_type="application/pdf")
+
+    return pdf_blob.public_url
+
+
 def get_image_file_from_gcs(filename):
     from google.cloud import storage
 
@@ -441,6 +455,25 @@ def get_image_file_from_gcs(filename):
     return contents
 
 
+def get_pdf_file_from_gcs(filename):
+    """Download object `filename` from the "arena-pdf-dev" bucket as bytes."""
+    from google.cloud import storage
+
+    pdf_bucket = storage.Client().get_bucket("arena-pdf-dev")
+    # The f-string coerces a non-str `filename` to its string form.
+    pdf_blob = pdf_bucket.blob(f"{filename}")
+
+    return pdf_blob.download_as_bytes()
+
+
+def hash_pdf(file_path):
+    """Return the hex MD5 digest of the bytes of the file at `file_path`."""
+    import hashlib
+
+    with open(file_path, "rb") as pdf_file:
+        return hashlib.md5(pdf_file.read()).hexdigest()
+
+
 def image_moderation_request(image_bytes, endpoint, api_key):
     headers = {"Content-Type": "image/jpeg", "Ocp-Apim-Subscription-Key": api_key}
 

Original file line number	Diff line number	Diff line change
`@@ -112,13 +112,14 @@`
`112`	`112`
`113`	`113`
`114`	`114`	`class State:`
`115`		`- def __init__(self, model_name, is_vision=False):`
	`115`	`+ def __init__(self, model_name, is_vision=False, pdf_id=None):`
`116`	`116`	`self.conv = get_conversation_template(model_name)`
`117`	`117`	`self.conv_id = uuid.uuid4().hex`
`118`	`118`	`self.skip_next = False`
`119`	`119`	`self.model_name = model_name`
`120`	`120`	`self.oai_thread_id = None`
`121`	`121`	`self.is_vision = is_vision`
	`122`	`+ self.pdf_id = pdf_id # NOTE(Tim): Version 1 PDFChat Architecture, could be revised later.`
`122`	`123`
`123`	`124`	`# NOTE(chris): This could be sort of a hack since it assumes the user only uploads one image. If they can upload multiple, we should store a list of image hashes.`
`124`	`125`	`self.has_csam_image = False`
`@@ -151,6 +152,7 @@ def dict(self):`
`151`	`152`	`{`
`152`	`153`	`"conv_id": self.conv_id,`
`153`	`154`	`"model_name": self.model_name,`
	`155`	`+ "pdf_id": self.pdf_id,`
`154`	`156`	`}`
`155`	`157`	`)`
`156`	`158`