# tokenizer_manage_config.py
# coding=utf-8
"""
@project: maxkb
@Author:虎
@file: tokenizer_manage_config.py
@date:2024/4/28 10:17
@desc: Wrapper around a ``tokenizers.Tokenizer`` and a loader that reads it
       from the local ``models--bert-base-cased`` directory.
"""
import os
  10. class MKTokenizer:
  11. def __init__(self, tokenizer):
  12. self.tokenizer = tokenizer
  13. def encode(self, text):
  14. return self.tokenizer.encode(text).ids
  15. class TokenizerManage:
  16. tokenizer = None
  17. @staticmethod
  18. def get_tokenizer():
  19. from tokenizers import Tokenizer
  20. # 创建Tokenizer
  21. model_path = os.path.join("/opt/maxkb-app", "model", "tokenizer", "models--bert-base-cased")
  22. with open(f"{model_path}/refs/main", encoding="utf-8") as f: snapshot = f.read()
  23. TokenizerManage.tokenizer = Tokenizer.from_file(f"{model_path}/snapshots/{snapshot}/tokenizer.json")
  24. return MKTokenizer(TokenizerManage.tokenizer)