# coding=utf-8
"""
    @project: maxkb
    @Author:虎
    @file: tokenizer_manage_config.py
    @date:2024/4/28 10:17
    @desc: Loads the locally stored bert-base-cased tokenizer and wraps it so
           that encoding a text returns its token ids.
"""
- import os
class MKTokenizer:
    """Adapter that returns token ids directly from a wrapped tokenizer.

    The wrapped object only needs an ``encode(text)`` method whose result
    exposes an ``ids`` attribute; nothing else is required by this class.
    """

    def __init__(self, tokenizer):
        """Remember the tokenizer that ``encode`` delegates to.

        Args:
            tokenizer: Object providing ``encode(text)``; the value it returns
                must have an ``ids`` attribute.
        """
        self.tokenizer = tokenizer

    def encode(self, text):
        """Encode *text* and return only the token ids.

        Args:
            text: Value passed unchanged to the wrapped tokenizer's ``encode``.

        Returns:
            The ``ids`` attribute of the object returned by
            ``self.tokenizer.encode(text)``.
        """
        encoding = self.tokenizer.encode(text)
        return encoding.ids
class TokenizerManage:
    """Loader for the ``bert-base-cased`` tokenizer stored under /opt/maxkb-app.

    The loaded tokenizer is kept on the class so that it is read from disk
    only once per process; later calls reuse the same object.
    """

    # Shared tokenizer instance; stays ``None`` until the first successful load.
    tokenizer = None

    @staticmethod
    def get_tokenizer():
        """Return an ``MKTokenizer`` wrapping the shared tokenizer.

        On the first call the snapshot name is read from ``refs/main`` inside
        the model directory and ``tokenizer.json`` of that snapshot is loaded.
        Subsequent calls skip the file access and wrap the cached object.

        Returns:
            A new ``MKTokenizer`` around ``TokenizerManage.tokenizer``.

        Raises:
            OSError: If ``refs/main`` cannot be opened.
            Exception: Whatever ``Tokenizer.from_file`` raises when the
                snapshot's ``tokenizer.json`` is missing or invalid.
        """
        if TokenizerManage.tokenizer is None:
            # Kept function-local, as in the original code, so the dependency
            # is only needed once a tokenizer is actually requested.
            from tokenizers import Tokenizer

            # Create the Tokenizer
            model_path = os.path.join("/opt/maxkb-app", "model", "tokenizer", "models--bert-base-cased")
            with open(os.path.join(model_path, "refs", "main"), encoding="utf-8") as f:
                # Strip surrounding whitespace: a trailing newline in the ref
                # file would otherwise end up inside the snapshot path.
                snapshot = f.read().strip()
            TokenizerManage.tokenizer = Tokenizer.from_file(
                os.path.join(model_path, "snapshots", snapshot, "tokenizer.json")
            )
        return MKTokenizer(TokenizerManage.tokenizer)
|