Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
171 changes: 98 additions & 73 deletions make_book.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from copy import copy
from os import environ as env
from pathlib import Path
import asyncio

import openai
import requests
Expand All @@ -26,13 +27,21 @@ def __init__(self, key, language, api_base=None):
self.key = key
self.language = language
self.current_key_index = 0
self.key_lock = asyncio.Lock()

def get_key(self, key_str):
keys = key_str.split(",")
key = keys[self.current_key_index]
self.current_key_index = (self.current_key_index + 1) % len(keys)
return key

async def get_key_async(self, key_str):
    """Async variant of ``get_key``: round-robin key selection under a lock.

    The lock serializes access to ``self.current_key_index`` so concurrent
    translation tasks never hand out the same key twice in a row.
    """
    async with self.key_lock:
        candidates = key_str.split(",")
        idx = self.current_key_index
        self.current_key_index = (idx + 1) % len(candidates)
        return candidates[idx]

@abstractmethod
def translate(self, text):
pass
Expand Down Expand Up @@ -88,53 +97,38 @@ def __init__(self, key, language, api_base=None):
if api_base:
openai.api_base = api_base

def translate(self, text):
async def translate_async(self, text):
print(text)
openai.api_key = self.get_key(self.key)
try:
completion = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
# english prompt here to save tokens
"content": f"Please help me to translate,`{text}` to {self.language}, please return only translated content not include the origin text",
}
],
)
t_text = (
completion["choices"][0]
.get("message")
.get("content")
.encode("utf8")
.decode()
)
if not NO_LIMIT:
# for time limit
time.sleep(3)
except Exception as e:
# TIME LIMIT for open api please pay
key_len = self.key.count(",") + 1
sleep_time = int(60 / key_len)
time.sleep(sleep_time)
print(str(e), "will sleep " + str(sleep_time) + " seconds")
openai.api_key = self.get_key(self.key)
completion = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
"content": f"Please help me to translate,`{text}` to {self.language}, please return only translated content not include the origin text",
}
],
)
t_text = (
completion["choices"][0]
.get("message")
.get("content")
.encode("utf8")
.decode()
)
retry = 5
t_text = None
while retry > 0 and not t_text:
openai.api_key = await self.get_key_async(self.key)
try:
completion = await openai.ChatCompletion.acreate(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
# english prompt here to save tokens
"content": f"Please help me to translate,`{text}` to {self.language}, please return only translated content not include the origin text",
}
],
)
t_text = (
completion["choices"][0]
.get("message")
.get("content")
.encode("utf8")
.decode()
)
if not NO_LIMIT:
# for time limit
await asyncio.sleep(3)
except Exception as e:
sleep_time = 2 ** (5 - retry)
print(str(e), "will sleep", sleep_time, "seconds")
await asyncio.sleep(sleep_time)
retry -= 1
print(t_text)
return t_text

Expand All @@ -160,44 +154,47 @@ def make_bilingual_book(self):
new_book.metadata = self.origin_book.metadata
new_book.spine = self.origin_book.spine
new_book.toc = self.origin_book.toc
all_items = list(self.origin_book.get_items())
all_items = list(
self.origin_book.get_items()
) # item can be a chapter or a full page of text
# we just translate tag p
all_p_length = 0
for i in all_items:
if i.file_name.endswith(".xhtml"):
all_p_length += len(bs(i.content, "html.parser").findAll("p"))
for item in all_items:
if item.file_name.endswith(".xhtml"):
all_p_length += len(bs(item.content, "html.parser").findAll("p"))
else:
all_p_length += len(bs(i.content, "xml").findAll("p"))
all_p_length += len(bs(item.content, "xml").findAll("p"))
if IS_TEST:
pbar = tqdm(total=TEST_NUM)
else:
pbar = tqdm(total=all_p_length)
index = 0
index = 0 # current iterator of the paragraph to translate
p_to_save_len = len(self.p_to_save)
try:
for i in self.origin_book.get_items():
for item in self.origin_book.get_items():
pbar.update(index)
if i.get_type() == 9:
soup = bs(i.content, "html.parser")
# stop if index reached TEST_NUM in the test mode
if IS_TEST and index >= TEST_NUM:
break
if item.get_type() == 9:
soup = bs(item.content, "html.parser")
p_list = soup.findAll("p")
is_test_done = IS_TEST and index > TEST_NUM
for p in p_list:
if is_test_done or not p.text or self._is_special_text(p.text):
continue
new_p = copy(p)
# TODO banch of p to translate then combine
# PR welcome here
if self.resume and index < p_to_save_len:
new_p.string = self.p_to_save[index]
p_batches = self.create_batches(p_list, BATCH_SIZE)
for p_batch in p_batches:
if self.resume and index + len(p_batch) < p_to_save_len:
# read cached p_list from cache file
p_results = self.p_to_save[index : index + len(p_batch)]
else:
new_p.string = self.translate_model.translate(p.text)
self.p_to_save.append(new_p.text)
p.insert_after(new_p)
index += 1
if IS_TEST and index > TEST_NUM:
break
i.content = soup.prettify().encode()
new_book.add_item(i)
# p_results is a list of modified p in order
p_results = asyncio.run(self.batch_process(p_batch))
# save p_results to cache file
# TODO check p_results
self.p_to_save.extend(p_results)
index += len(p_batch) # update index for pbar
print(f"processed {len(p_results)} paragraphs in batch")
item.content = soup.prettify().encode()

new_book.add_item(item)
name = self.epub_name.split(".")[0]
epub.write_epub(f"{name}_bilingual.epub", new_book, {})
pbar.close()
Expand All @@ -221,6 +218,23 @@ def save_progress(self):
except:
raise Exception("can not save resume file")

def create_batches(self, p_list, batch_size):
    """Split *p_list* into consecutive chunks of at most *batch_size* items.

    Args:
        p_list: sequence of paragraph tags (or any sliceable sequence).
        batch_size: maximum number of items per chunk; must be >= 1.

    Returns:
        A list of lists preserving the original order; the final chunk may
        be shorter than *batch_size*. An empty input yields an empty list.

    Raises:
        ValueError: if *batch_size* is less than 1. Without this guard,
            ``batch_size == 0`` crashes inside ``range()`` with an unrelated
            message, and a negative value silently returns ``[]`` — dropping
            every paragraph from the translation.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    return [p_list[i : i + batch_size] for i in range(0, len(p_list), batch_size)]

async def batch_process(self, p_batch):
    """Translate every paragraph in *p_batch* concurrently.

    Fans out one ``self.process`` task per paragraph and awaits them all;
    the returned list keeps the same order as the input batch.
    """
    return await asyncio.gather(*(self.process(p) for p in p_batch))

async def process(self, p):
    """Translate one paragraph tag and insert the translation after it.

    Empty or "special" paragraphs (per ``self._is_special_text``) are
    returned untouched. Otherwise a shallow copy of the tag is filled with
    the translated text and placed right after the original in the tree,
    producing the bilingual side-by-side layout.
    """
    text = p.text
    if text and not self._is_special_text(text):
        duplicate = copy(p)
        duplicate.string = await self.translate_model.translate_async(text)
        # bilingual output: translation follows the original paragraph
        p.insert_after(duplicate)
    return p


if __name__ == "__main__":
MODEL_DICT = {"gpt3": GPT3, "chatgpt": ChatGPT}
Expand Down Expand Up @@ -249,7 +263,7 @@ def save_progress(self):
"--test",
dest="test",
action="store_true",
help="if test we only translat 10 contents you can easily check",
help="if test we only translate 10 contents you can easily check",
)
parser.add_argument(
"--test_num",
Expand Down Expand Up @@ -296,6 +310,14 @@ def save_progress(self):
type=str,
help="replace base url from openapi",
)
parser.add_argument(
"-b",
"--batch_size",
dest="batch_size",
type=int,
default=1,
help="number of paragraph(s) to translate per batch (it will override --no_limit to true if batch_size > 1)",
)

options = parser.parse_args()
NO_LIMIT = options.no_limit
Expand All @@ -308,6 +330,9 @@ def save_progress(self):

OPENAI_API_KEY = options.openai_key or env.get("OPENAI_API_KEY")
RESUME = options.resume
BATCH_SIZE = options.batch_size
if BATCH_SIZE > 1:
NO_LIMIT = True
if not OPENAI_API_KEY:
raise Exception("Need openai API key, please google how to")
if not options.book_name.endswith(".epub"):
Expand Down