Skip to content

Commit f7e92e1

Browse files
committed
support google cloud storage
1 parent f1c6185 commit f7e92e1

File tree

6 files changed

+119
-8
lines changed

6 files changed

+119
-8
lines changed

fastchat/constants.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@
2525
IMAGE_MODERATION_MSG = (
2626
"$MODERATION$ YOUR IMAGE VIOLATES OUR CONTENT MODERATION GUIDELINES."
2727
)
28+
PDF_MODERATION_MSG = (
29+
"$MODERATION$ YOUR PDF VIOLATES OUR CONTENT MODERATION GUIDELINES."
30+
)
2831
MODERATION_MSG = "$MODERATION$ YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES."
2932
CONVERSATION_LIMIT_MSG = "YOU HAVE REACHED THE CONVERSATION LENGTH LIMIT. PLEASE CLEAR HISTORY AND START A NEW CONVERSATION."
3033
INACTIVE_MSG = "THIS SESSION HAS BEEN INACTIVE FOR TOO LONG. PLEASE REFRESH THIS PAGE."

fastchat/serve/gradio_block_arena_vision.py

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,44 @@ def wrap_pdfchat_query(query, document):
260260
}
261261

262262

263+
# TODO: P1: Integrate this.
264+
def pdf_moderator(images):
    """Check rendered PDF pages against OpenAI's image moderation model.

    Every page is encoded as a JPEG, wrapped in a base64 ``data:`` URL and
    sent to the ``omni-moderation-latest`` model, one page per request.

    Args:
        images: Iterable of page images exposing a PIL-style
            ``save(fp, format=...)`` method. ``None`` or an empty iterable
            is treated as "nothing to moderate".

    Returns:
        True as soon as any page is flagged, otherwise False. False is also
        returned when there are no pages or when every request failed.
    """
    import base64
    from io import BytesIO

    image_inputs = []
    for image in images or []:
        buffer = BytesIO()
        image.save(buffer, format="JPEG")
        image_b64 = base64.b64encode(buffer.getvalue()).decode("utf-8")

        # Multi-modal input part in the format the OpenAI API expects.
        image_inputs.append(
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{image_b64}",
                },
            }
        )

    if not image_inputs:
        # Nothing to moderate: skip the client (and its import) entirely.
        return False

    # Imported only once there is work to do, so callers with no pages do not
    # need the OpenAI SDK or credentials.
    from openai import OpenAI

    # OpenAI's maximum number of images is 1 at the moment, hence one request
    # per page.
    client = OpenAI()
    for image_input in image_inputs:
        try:
            response = client.moderations.create(
                model="omni-moderation-latest",
                # Multi-modal inputs must be passed as a list of parts.
                input=[image_input],
            )
        except Exception as e:
            # NOTE(review): best-effort by design - a failed request is
            # reported and the page is treated as unflagged (fails open).
            print(f"PDF moderation request failed: {e}")
            continue

        # One flagged page is enough to reject the whole document.
        if any(result.flagged for result in response.results):
            return True

    return False
299+
300+
263301
def detect_language_from_doc(pdf_file_path):
264302
from pdf2image import convert_from_path
265303
from polyglot.detect import Detector
@@ -272,6 +310,7 @@ def detect_language_from_doc(pdf_file_path):
272310

273311
# Convert the PDF into page images (only the first page is OCR'd below, for efficiency)
274312
images = convert_from_path(pdf_file_path)
313+
275314
extracted_text = pytesseract.image_to_string(
276315
images[0], lang=TESSERACT_SUPPORTED_LANGS
277316
)
@@ -291,8 +330,6 @@ def parse_pdf(file_path):
291330
doc_lang = detect_language_from_doc(file_path)
292331
doc_lang = LLAMAPARSE_SUPPORTED_LANGS[doc_lang[0]]
293332

294-
print(doc_lang)
295-
296333
for _ in range(LLAMA_PARSE_MAX_RETRY):
297334
try:
298335
documents = LlamaParse(

fastchat/serve/gradio_block_arena_vision_anony.py

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from fastchat.constants import (
1414
TEXT_MODERATION_MSG,
1515
IMAGE_MODERATION_MSG,
16+
PDF_MODERATION_MSG,
1617
MODERATION_MSG,
1718
CONVERSATION_LIMIT_MSG,
1819
SLOW_MODEL_MSG,
@@ -77,6 +78,8 @@
7778
build_logger,
7879
moderation_filter,
7980
image_moderation_filter,
81+
upload_pdf_file_to_gcs,
82+
hash_pdf,
8083
)
8184

8285
logger = build_logger("gradio_web_server_multi", "gradio_web_server_multi.log")
@@ -297,10 +300,26 @@ def add_text(
297300
PDFCHAT_SAMPLING_WEIGHTS,
298301
PDFCHAT_SAMPLING_BOOST_MODELS,
299302
)
303+
304+
# Save a unique ID for mapping the conversation back to the file on Google Cloud Storage.
305+
unique_id = hash_pdf(pdfs[0])
306+
300307
states = [
301-
State(model_left, is_vision=False),
302-
State(model_right, is_vision=False),
308+
State(
309+
model_left,
310+
is_vision=False,
311+
pdf_id=unique_id
312+
),
313+
State(
314+
model_right,
315+
is_vision=False,
316+
pdf_id=unique_id
317+
),
303318
]
319+
upload_pdf_file_to_gcs(
320+
pdf_file_path=pdfs[0],
321+
filename=unique_id,
322+
)
304323
else:
305324
model_left, model_right = get_battle_pair(
306325
context.all_text_models,
@@ -309,7 +328,6 @@ def add_text(
309328
SAMPLING_WEIGHTS,
310329
SAMPLING_BOOST_MODELS,
311330
)
312-
313331
states = [
314332
State(model_left, is_vision=False),
315333
State(model_right, is_vision=False),
@@ -333,12 +351,11 @@ def add_text(
333351

334352
images = convert_images_to_conversation_format(images)
335353

354+
# TODO: add PDF moderator
336355
text, image_flagged, csam_flag = moderate_input(
337356
state0, text, text, model_list, images, ip
338357
)
339358

340-
# TODO: add PDF moderator
341-
342359
conv = states[0].conv
343360
if (len(conv.messages) - conv.offset) // 2 >= CONVERSATION_TURN_LIMIT:
344361
logger.info(f"conversation turn limit. ip: {get_ip(request)}. text: {text}")

fastchat/serve/gradio_web_server.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,13 +112,14 @@
112112

113113

114114
class State:
115-
def __init__(self, model_name, is_vision=False):
115+
def __init__(self, model_name, is_vision=False, pdf_id=None):
116116
self.conv = get_conversation_template(model_name)
117117
self.conv_id = uuid.uuid4().hex
118118
self.skip_next = False
119119
self.model_name = model_name
120120
self.oai_thread_id = None
121121
self.is_vision = is_vision
122+
self.pdf_id = pdf_id # NOTE(Tim): Version 1 PDFChat Architecture, could be revised later.
122123

123124
# NOTE(chris): This could be sort of a hack since it assumes the user only uploads one image. If they can upload multiple, we should store a list of image hashes.
124125
self.has_csam_image = False
@@ -151,6 +152,7 @@ def dict(self):
151152
{
152153
"conv_id": self.conv_id,
153154
"model_name": self.model_name,
155+
"pdf_id": self.pdf_id,
154156
}
155157
)
156158

fastchat/serve/setup_pdfchat.sh

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#!/usr/bin/env bash
#
# Install the Python packages and Tesseract language data used by PDFChat.
#
# Source this script (`source fastchat/serve/setup_pdfchat.sh`) if
# TESSDATA_PREFIX should stay set in your shell afterwards; when the script is
# executed as a child process the export is only visible to the script itself.
#
# Each step stops the script on failure. `return` is tried first so that a
# sourced run does not close the calling shell; `exit` is the fallback when
# the script is executed directly.

# Install Python packages
pip install llama-index-core llama-parse llama-index-readers-file python-dotenv \
    && pip install polyglot PyICU pycld2 pytesseract pdf2image \
    || { echo "setup_pdfchat: pip install failed" >&2; return 1 2>/dev/null || exit 1; }

# Clone the Tesseract tessdata repository (skipped when a previous run already
# created the directory; shallow because only the current data files are needed)
if [ ! -d tessdata ]; then
    git clone --depth 1 https://github.com/tesseract-ocr/tessdata \
        || { echo "setup_pdfchat: cloning tessdata failed" >&2; return 1 2>/dev/null || exit 1; }
fi

# cd into tessdata and set TESSDATA_PREFIX to the current directory
cd tessdata \
    || { echo "setup_pdfchat: cannot enter tessdata directory" >&2; return 1 2>/dev/null || exit 1; }
export TESSDATA_PREFIX="$PWD"

echo "TESSDATA_PREFIX is set to: $TESSDATA_PREFIX"

fastchat/utils.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -430,6 +430,20 @@ def upload_image_file_to_gcs(image, filename):
430430
return blob.public_url
431431

432432

433+
def upload_pdf_file_to_gcs(pdf_file_path, filename, bucket_name="arena-pdf-dev"):
    """Upload a local PDF to Google Cloud Storage.

    Args:
        pdf_file_path: Path of the PDF on local disk.
        filename: Object name to store the PDF under inside the bucket.
        bucket_name: Destination bucket. Defaults to the bucket that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        The ``public_url`` attribute of the uploaded blob.
    """
    from google.cloud import storage

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)

    blob = bucket.blob(filename)
    # upload_from_filename opens the file in binary read mode itself.
    blob.upload_from_filename(pdf_file_path, content_type="application/pdf")

    return blob.public_url
445+
446+
433447
def get_image_file_from_gcs(filename):
434448
from google.cloud import storage
435449

@@ -441,6 +455,25 @@ def get_image_file_from_gcs(filename):
441455
return contents
442456

443457

458+
def get_pdf_file_from_gcs(filename, bucket_name="arena-pdf-dev"):
    """Download a stored PDF from Google Cloud Storage.

    Args:
        filename: Object name of the PDF inside the bucket.
        bucket_name: Source bucket. Defaults to the bucket that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        The object's contents as ``bytes``.
    """
    from google.cloud import storage

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    # str() keeps the string coercion the original f-string performed.
    blob = bucket.blob(str(filename))

    return blob.download_as_bytes()
467+
468+
469+
def hash_pdf(file_path):
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The digest serves as a content-derived identifier for the uploaded PDF
    (same bytes, same id); it is not meant as a security measure.

    Args:
        file_path: Path of the file to hash.

    Returns:
        The 32-character lowercase hex digest of the file's contents.
    """
    import hashlib

    digest = hashlib.md5()
    with open(file_path, "rb") as f:
        # Stream fixed-size chunks so a large PDF is never held in memory
        # all at once; the resulting digest is identical to hashing it whole.
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            digest.update(chunk)
    return digest.hexdigest()
475+
476+
444477
def image_moderation_request(image_bytes, endpoint, api_key):
445478
headers = {"Content-Type": "image/jpeg", "Ocp-Apim-Subscription-Key": api_key}
446479

0 commit comments

Comments
 (0)