Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
171 changes: 98 additions & 73 deletions make_book.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from copy import copy
from os import environ as env
from pathlib import Path
import asyncio

import openai
import requests
Expand All @@ -26,13 +27,21 @@ def __init__(self, key, language, api_base=None):
self.key = key
self.language = language
self.current_key_index = 0
self.key_lock = asyncio.Lock()

def get_key(self, key_str):
keys = key_str.split(",")
key = keys[self.current_key_index]
self.current_key_index = (self.current_key_index + 1) % len(keys)
return key

async def get_key_async(self, key_str):
    """Async variant of ``get_key``: round-robin key selection under a lock.

    The lock serializes access to ``self.current_key_index`` so concurrent
    translation tasks never hand out the same key twice in a row.
    """
    async with self.key_lock:
        candidates = key_str.split(",")
        idx = self.current_key_index
        self.current_key_index = (idx + 1) % len(candidates)
        return candidates[idx]

@abstractmethod
def translate(self, text):
pass
Expand Down Expand Up @@ -88,53 +97,38 @@ def __init__(self, key, language, api_base=None):
if api_base:
openai.api_base = api_base

def translate(self, text):
async def translate_async(self, text):
print(text)
openai.api_key = self.get_key(self.key)
try:
completion = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
# english prompt here to save tokens
"content": f"Please help me to translate,`{text}` to {self.language}, please return only translated content not include the origin text",
}
],
)
t_text = (
completion["choices"][0]
.get("message")
.get("content")
.encode("utf8")
.decode()
)
if not NO_LIMIT:
# for time limit
time.sleep(3)
except Exception as e:
# TIME LIMIT for open api please pay
key_len = self.key.count(",") + 1
sleep_time = int(60 / key_len)
time.sleep(sleep_time)
print(str(e), "will sleep " + str(sleep_time) + " seconds")
openai.api_key = self.get_key(self.key)
completion = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
"content": f"Please help me to translate,`{text}` to {self.language}, please return only translated content not include the origin text",
}
],
)
t_text = (
completion["choices"][0]
.get("message")
.get("content")
.encode("utf8")
.decode()
)
retry = 5
t_text = None
while retry > 0 and not t_text:
openai.api_key = await self.get_key_async(self.key)
try:
completion = await openai.ChatCompletion.acreate(
model="gpt-3.5-turbo",
messages=[
{
"role": "user",
# english prompt here to save tokens
"content": f"Please help me to translate,`{text}` to {self.language}, please return only translated content not include the origin text",
}
],
)
t_text = (
completion["choices"][0]
.get("message")
.get("content")
.encode("utf8")
.decode()
)
if not NO_LIMIT:
# for time limit
await asyncio.sleep(3)
except Exception as e:
sleep_time = 2 ** (5 - retry)
print(str(e), "will sleep", sleep_time, "seconds")
await asyncio.sleep(sleep_time)
retry -= 1
print(t_text)
return t_text

Expand All @@ -160,44 +154,47 @@ def make_bilingual_book(self):
new_book.metadata = self.origin_book.metadata
new_book.spine = self.origin_book.spine
new_book.toc = self.origin_book.toc
all_items = list(self.origin_book.get_items())
all_items = list(
self.origin_book.get_items()
) # item can be a chapter or a full page of text
# we just translate tag p
all_p_length = 0
for i in all_items:
if i.file_name.endswith(".xhtml"):
all_p_length += len(bs(i.content, "html.parser").findAll("p"))
for item in all_items:
if item.file_name.endswith(".xhtml"):
all_p_length += len(bs(item.content, "html.parser").findAll("p"))
else:
all_p_length += len(bs(i.content, "xml").findAll("p"))
all_p_length += len(bs(item.content, "xml").findAll("p"))
if IS_TEST:
pbar = tqdm(total=TEST_NUM)
else:
pbar = tqdm(total=all_p_length)
index = 0
index = 0 # current iterator of the paragraph to translate
p_to_save_len = len(self.p_to_save)
try:
for i in self.origin_book.get_items():
for item in self.origin_book.get_items():
pbar.update(index)
if i.get_type() == 9:
soup = bs(i.content, "html.parser")
# stop if index reached TEST_NUM in the test mode
if IS_TEST and index >= TEST_NUM:
break
if item.get_type() == 9:
soup = bs(item.content, "html.parser")
p_list = soup.findAll("p")
is_test_done = IS_TEST and index > TEST_NUM
for p in p_list:
if is_test_done or not p.text or self._is_special_text(p.text):
continue
new_p = copy(p)
# TODO banch of p to translate then combine
# PR welcome here
if self.resume and index < p_to_save_len:
new_p.string = self.p_to_save[index]
p_batches = self.create_batches(p_list, BATCH_SIZE)
for p_batch in p_batches:
if self.resume and index + len(p_batch) < p_to_save_len:
# read cached p_list from cache file
p_results = self.p_to_save[index : index + len(p_batch)]
else:
new_p.string = self.translate_model.translate(p.text)
self.p_to_save.append(new_p.text)
p.insert_after(new_p)
index += 1
if IS_TEST and index > TEST_NUM:
break
i.content = soup.prettify().encode()
new_book.add_item(i)
# p_results is a list of modified p in order
p_results = asyncio.run(self.batch_process(p_batch))
# save p_results to cache file
# TODO check p_results
self.p_to_save.extend(p_results)
index += len(p_batch) # update index for pbar
print(f"processed {len(p_results)} paragraphs in batch")
item.content = soup.prettify().encode()

new_book.add_item(item)
name = self.epub_name.split(".")[0]
epub.write_epub(f"{name}_bilingual.epub", new_book, {})
pbar.close()
Expand All @@ -221,6 +218,23 @@ def save_progress(self):
except:
raise Exception("can not save resume file")

def create_batches(self, p_list, batch_size):
    """Split *p_list* into consecutive chunks of at most *batch_size* items.

    Args:
        p_list: sequence of paragraph tags (or any sliceable sequence).
        batch_size: maximum number of items per chunk; must be >= 1.

    Returns:
        A list of lists preserving the original order; the final chunk may
        be shorter than *batch_size*. An empty input yields an empty list.

    Raises:
        ValueError: if *batch_size* is less than 1. Without this guard,
            ``batch_size == 0`` crashes inside ``range()`` with an unrelated
            message, and a negative value silently returns ``[]`` — dropping
            every paragraph from the translation.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    return [p_list[i : i + batch_size] for i in range(0, len(p_list), batch_size)]

async def batch_process(self, p_batch):
    """Translate every paragraph in *p_batch* concurrently.

    Fans out one ``self.process`` task per paragraph and awaits them all;
    the returned list keeps the same order as the input batch.
    """
    return await asyncio.gather(*(self.process(p) for p in p_batch))

async def process(self, p):
    """Translate one paragraph tag and insert the translation after it.

    Empty or "special" paragraphs (per ``self._is_special_text``) are
    returned untouched. Otherwise a shallow copy of the tag is filled with
    the translated text and placed right after the original in the tree,
    producing the bilingual side-by-side layout.
    """
    text = p.text
    if text and not self._is_special_text(text):
        duplicate = copy(p)
        duplicate.string = await self.translate_model.translate_async(text)
        # bilingual output: translation follows the original paragraph
        p.insert_after(duplicate)
    return p


if __name__ == "__main__":
MODEL_DICT = {"gpt3": GPT3, "chatgpt": ChatGPT}
Expand Down Expand Up @@ -249,7 +263,7 @@ def save_progress(self):
"--test",
dest="test",
action="store_true",
help="if test we only translat 10 contents you can easily check",
help="if test we only translate 10 contents you can easily check",
)
parser.add_argument(
"--test_num",
Expand Down Expand Up @@ -296,6 +310,14 @@ def save_progress(self):
type=str,
help="replace base url from openapi",
)
parser.add_argument(
"-b",
"--batch_size",
dest="batch_size",
type=int,
default=1,
help="number of paragraph(s) to translate per batch (it will override --no_limit to true if batch_size > 1)",
)

options = parser.parse_args()
NO_LIMIT = options.no_limit
Expand All @@ -308,6 +330,9 @@ def save_progress(self):

OPENAI_API_KEY = options.openai_key or env.get("OPENAI_API_KEY")
RESUME = options.resume
BATCH_SIZE = options.batch_size
if BATCH_SIZE > 1:
NO_LIMIT = True
if not OPENAI_API_KEY:
raise Exception("Need openai API key, please google how to")
if not options.book_name.endswith(".epub"):
Expand Down