ts_vecto_util.py 2.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
  1. # coding=utf-8
  2. """
  3. @project: maxkb
  4. @Author:虎
  5. @file: ts_vecto_util.py
  6. @date:2024/4/16 15:26
  7. @desc:
  8. """
  9. import re
  10. import uuid_utils.compat as uuid
  11. from typing import List
  12. import jieba
  13. import jieba.posseg
  14. jieba_word_list_cache = [chr(item) for item in range(38, 84)]
  15. for jieba_word in jieba_word_list_cache:
  16. jieba.add_word('#' + jieba_word + '#')
  17. # r"(?i)\b(?:https?|ftp|tcp|file)://[^\s]+\b",
  18. # 某些不分词数据
  19. # r'"([^"]*)"'
  20. word_pattern_list = [r"v\d+.\d+.\d+",
  21. r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}"]
  22. remove_chars = '\n , :\'<>!@#¥%……&*()!@#$%^&*(): ;,/"./'
  23. jieba_remove_flag_list = ['x', 'w']
  24. def get_word_list(text: str):
  25. result = []
  26. for pattern in word_pattern_list:
  27. word_list = re.findall(pattern, text)
  28. for child_list in word_list:
  29. for word in child_list if isinstance(child_list, tuple) else [child_list]:
  30. # 不能有: 所以再使用: 进行分割
  31. if word.__contains__(':'):
  32. item_list = word.split(":")
  33. for w in item_list:
  34. result.append(w)
  35. else:
  36. result.append(word)
  37. return result
  38. def replace_word(word_dict, text: str):
  39. for key in word_dict:
  40. pattern = '(?<!#)' + re.escape(word_dict[key]) + '(?!#)'
  41. text = re.sub(pattern, key, text)
  42. return text
  43. def get_word_key(text: str, use_word_list):
  44. j_word = next((j for j in jieba_word_list_cache if j not in text and all(j not in used for used in use_word_list)),
  45. None)
  46. if j_word:
  47. return j_word
  48. j_word = str(uuid.uuid7())
  49. jieba.add_word(j_word)
  50. return j_word
  51. def to_word_dict(word_list: List, text: str):
  52. word_dict = {}
  53. for word in word_list:
  54. key = get_word_key(text, set(word_dict))
  55. word_dict['#' + key + '#'] = word
  56. return word_dict
  57. def get_key_by_word_dict(key, word_dict):
  58. v = word_dict.get(key)
  59. if v is None:
  60. return key
  61. return v
  62. def to_ts_vector(text: str):
  63. # 分词
  64. result = jieba.lcut(text, cut_all=True)
  65. return " ".join(result)
  66. def to_query(text: str):
  67. extract_tags = jieba.lcut(text, cut_all=True)
  68. result = " ".join(extract_tags)
  69. return result