Merged
26 changes: 17 additions & 9 deletions apps/common/config/tokenizer_manage_config.py
@@ -6,19 +6,27 @@
 @date:2024/4/28 10:17
 @desc:
 """
+import os
+from pathlib import Path
+
+BASE_DIR = Path(__file__).resolve().parent.parent.parent
+
+
+class MKTokenizer:
+    def __init__(self, tokenizer):
+        self.tokenizer = tokenizer
+
+    def encode(self, text):
+        return self.tokenizer.encode(text).ids


 class TokenizerManage:
     tokenizer = None

     @staticmethod
     def get_tokenizer():
-        from transformers import BertTokenizer
-        if TokenizerManage.tokenizer is None:
-            TokenizerManage.tokenizer = BertTokenizer.from_pretrained(
-                'bert-base-cased',
-                cache_dir="/opt/maxkb-app/model/tokenizer",
-                local_files_only=True,
-                resume_download=False,
-                force_download=False)
-        return TokenizerManage.tokenizer
+        from tokenizers import Tokenizer
+        # Create the Tokenizer from the vendored tokenizer.json
+        s = os.path.join(BASE_DIR.parent, 'tokenizer', 'bert-base-cased', 'tokenizer.json')
+        TokenizerManage.tokenizer = Tokenizer.from_file(s)
+        return MKTokenizer(TokenizerManage.tokenizer)
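
For context on the wrapper: the tokenizers library's Tokenizer.encode returns an Encoding object rather than a plain list, so MKTokenizer exposes .ids to keep the list-of-token-ids contract that BertTokenizer.encode used to provide. A minimal usage sketch (the call site below is my own illustration, not part of this diff):

from common.config.tokenizer_manage_config import TokenizerManage

# Loads tokenizer.json from the vendored tokenizer directory; note that the
# new code reloads the file on every call, since the is-None guard was dropped.
tokenizer = TokenizerManage.get_tokenizer()
ids = tokenizer.encode("hello world")  # list of int token ids, via Encoding.ids
print(len(ids))  # e.g. token counting for usage limits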
Contributor Author:
Code Review and Suggestions

1. Imports:
   • Ensure all necessary imports sit at the top of the module for clarity.
2. Base Directory:
   • Using os.path.join together with Path() simplifies path handling and keeps it cross-platform.
3. MKTokenizer Class:
   • The encode method can simply return tokenizer.encode(text) rather than wrapping the result in another list.
4. TokenizerManage Class:
   • Simplify the creation process by loading the tokenizer's parameters directly into Tokenizer.
   • Create the MKTokenizer instance from the loaded tokenizer inside get_tokenizer.
   • Remove unnecessary comments that clutter the code.
5. File Paths:
   • Ensure that paths relative to __file__ are calculated correctly. BASE_DIR should point to the directory containing this file rather than climbing to its parent or grandparent directories twice.
6. Static Method Considerations:
   • Since the tokenizer might change at runtime (e.g., reloading based on user input or configuration), consider making this non-static, or moving to a different pattern if frequent changes are expected; one possible shape is sketched after the revised code below.

Revised Code:

@@ -6,15 +6,21 @@
 @date:2024/4/28 10:17
 @desc:
 """

+import os
+from pathlib import Path
+
+BASE_DIR = Path(__file__).resolve().parent
+
+
+class MKTokenizer:
+    def __init__(self, tokenizer):
+        self.tokenizer = tokenizer
+
+    def encode(self, text):
+        return self.tokenizer.encode(text)


 class TokenizerManage:
     tokenizer = None

     @staticmethod
     def get_tokenizer():
+        from tokenizers import Tokenizer
         # Load tokenizer params from file
-        path = "/opt/maxkb-app/model/tokenizer/bert-base-cased"
+        s = os.path.join(BASE_DIR.parent, 'tokenizer', 'bert-base-cased', 'tokenizer.json')

         TokenizerManage.tokenizer = Tokenizer.from_file(s)
         return MKTokenizer(TokenizerManage.tokenizer)

By applying these improvements, the code will be cleaner, more maintainable, and will adhere more closely to Python best practices.
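
To make point 6 concrete, one option is a lock-guarded lazy singleton. The sketch below is my own illustration, not part of the PR: the threading.Lock and the re-added is-None guard are assumptions layered onto the names used above.

import os
import threading
from pathlib import Path

from tokenizers import Tokenizer

BASE_DIR = Path(__file__).resolve().parent.parent.parent


class MKTokenizer:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def encode(self, text):
        return self.tokenizer.encode(text).ids


class TokenizerManage:
    _tokenizer = None
    _lock = threading.Lock()

    @classmethod
    def get_tokenizer(cls):
        # Double-checked locking: skip the lock entirely once initialized,
        # and guard the first file load against concurrent callers.
        if cls._tokenizer is None:
            with cls._lock:
                if cls._tokenizer is None:
                    path = os.path.join(BASE_DIR.parent, 'tokenizer',
                                        'bert-base-cased', 'tokenizer.json')
                    cls._tokenizer = Tokenizer.from_file(path)
        return MKTokenizer(cls._tokenizer)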

@@ -6,9 +6,7 @@
 @date:2024/4/18 15:28
 @desc:
 """
-from typing import List, Dict
-
-from langchain_core.messages import BaseMessage, get_buffer_string
+from typing import Dict

 from common.config.tokenizer_manage_config import TokenizerManage
 from models_provider.base_model_provider import MaxKBBaseModel
Contributor Author:
The provided Python code has a minor issue that you may want to address:

1. Redundant Import: List is imported from typing but no longer used anywhere in this module, so it can be dropped from the import.

Here's an updated version with the redundant import removed:

@@ -6,12 +6,7 @@
 @date:2024/4/18 15:28
 @desc:
 """
-import typing
-from typing import List, Dict
+from typing import Dict

 from common.config.tokenizer_manage_config import TokenizerManage
 from models_provider.base_model_provider import MaxKBBaseModel

# Remove the redundant import of 'List'; 'Dict' is still used, so keep it.
This change makes the code cleaner and removes an unused import. If there are no other issues, this will suffice.

23 changes: 23 additions & 0 deletions tokenizer/bert-base-cased/config.json
@@ -0,0 +1,23 @@
{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.6.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}
Empty file.
1 change: 1 addition & 0 deletions tokenizer/bert-base-cased/tokenizer.json

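tokenizer.json itself is too large to render here, but since config.json is vendored alongside it, a quick sanity check can confirm the two files describe the same vocabulary. This snippet is my own illustration and assumes it runs from the repository root:

import json

from tokenizers import Tokenizer

tok = Tokenizer.from_file('tokenizer/bert-base-cased/tokenizer.json')
with open('tokenizer/bert-base-cased/config.json') as f:
    cfg = json.load(f)

# The tokenizer's vocabulary should line up with the model config's vocab_size.
assert tok.get_vocab_size() == cfg['vocab_size']  # 28996 for bert-base-cased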

1 change: 1 addition & 0 deletions tokenizer/bert-base-cased/tokenizer_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"do_lower_case": false, "model_max_length": 512}