|
@@ -1202,7 +1202,7 @@ class ContentClassifierClient:
|
|
|
# 验证分类代码是否在有效列表中
|
|
# 验证分类代码是否在有效列表中
|
|
|
valid_codes = set(v[1] for v in index_mapping.values())
|
|
valid_codes = set(v[1] for v in index_mapping.values())
|
|
|
if category_code not in valid_codes:
|
|
if category_code not in valid_codes:
|
|
|
- print(f" 警告: 发现非标准分类 '{category_name}' ({category_code}),强制归为非标准项")
|
|
|
|
|
|
|
+ logger.warning(f"发现非标准分类 '{category_name}' ({category_code}),强制归为非标准项")
|
|
|
category_code = "no_standard"
|
|
category_code = "no_standard"
|
|
|
category_name = "非标准项"
|
|
category_name = "非标准项"
|
|
|
|
|
|
|
@@ -1251,7 +1251,7 @@ class ContentClassifierClient:
|
|
|
category_name = item.get("third_category_name", "")
|
|
category_name = item.get("third_category_name", "")
|
|
|
valid_codes = set(v[1] for v in index_mapping.values())
|
|
valid_codes = set(v[1] for v in index_mapping.values())
|
|
|
if category_code not in valid_codes:
|
|
if category_code not in valid_codes:
|
|
|
- print(f" 警告: 发现非标准分类 '{category_name}' ({category_code}),强制归为非标准项")
|
|
|
|
|
|
|
+ logger.warning(f"发现非标准分类 '{category_name}' ({category_code}),强制归为非标准项")
|
|
|
category_code = "no_standard"
|
|
category_code = "no_standard"
|
|
|
category_name = "非标准项"
|
|
category_name = "非标准项"
|
|
|
|
|
|
|
@@ -1268,10 +1268,9 @@ class ContentClassifierClient:
|
|
|
contents = self._merge_classified_contents(contents, section)
|
|
contents = self._merge_classified_contents(contents, section)
|
|
|
return contents, True # 解析成功(可能为空结果)
|
|
return contents, True # 解析成功(可能为空结果)
|
|
|
except Exception as e2:
|
|
except Exception as e2:
|
|
|
- error_msg = f"解析JSON失败: {e}, 二次修复也失败: {e2}"
|
|
|
|
|
- print(error_msg)
|
|
|
|
|
- print(f"原始响应前500字符: {response[:500]}...")
|
|
|
|
|
- print(f"提取的JSON前300字符: {json_str[:300]}...")
|
|
|
|
|
|
|
+ logger.error(f"解析JSON失败: {e}, 二次修复也失败: {e2}")
|
|
|
|
|
+ logger.debug(f"原始响应前500字符: {response[:500]}...")
|
|
|
|
|
+ logger.debug(f"提取的JSON前300字符: {json_str[:300]}...")
|
|
|
return [], False # 解析失败
|
|
return [], False # 解析失败
|
|
|
|
|
|
|
|
def _merge_classified_contents(self, contents: List[ClassifiedContent], section: SectionContent) -> List[ClassifiedContent]:
|
|
def _merge_classified_contents(self, contents: List[ClassifiedContent], section: SectionContent) -> List[ClassifiedContent]:
|
|
@@ -1786,14 +1785,14 @@ class LLMContentClassifier:
|
|
|
- tertiary_category_cn: 三级分类名称
|
|
- tertiary_category_cn: 三级分类名称
|
|
|
- tertiary_classification_details: 行级分类详情列表
|
|
- tertiary_classification_details: 行级分类详情列表
|
|
|
"""
|
|
"""
|
|
|
- print(f"\n正在对 {len(chunks)} 个内容块进行三级分类...")
|
|
|
|
|
|
|
+ logger.info(f"正在对 {len(chunks)} 个内容块进行三级分类...")
|
|
|
|
|
|
|
|
# 步骤1: 将 chunks 转换为 SectionContent 列表
|
|
# 步骤1: 将 chunks 转换为 SectionContent 列表
|
|
|
sections = self.converter.chunks_to_sections(chunks)
|
|
sections = self.converter.chunks_to_sections(chunks)
|
|
|
- print(f" 按二级标题分组后得到 {len(sections)} 个段落")
|
|
|
|
|
|
|
+ logger.info(f"按二级标题分组后得到 {len(sections)} 个段落")
|
|
|
|
|
|
|
|
if not sections:
|
|
if not sections:
|
|
|
- print(" 没有有效的段落需要分类")
|
|
|
|
|
|
|
+ logger.info("没有有效的段落需要分类")
|
|
|
return chunks
|
|
return chunks
|
|
|
|
|
|
|
|
# 步骤2: 创建分类客户端
|
|
# 步骤2: 创建分类客户端
|
|
@@ -1812,10 +1811,12 @@ class LLMContentClassifier:
|
|
|
results_map[section.section_key] = result
|
|
results_map[section.section_key] = result
|
|
|
|
|
|
|
|
if progress_callback:
|
|
if progress_callback:
|
|
|
- progress_callback(idx + 1, total)
|
|
|
|
|
|
|
+ ret = progress_callback(idx + 1, total, section.section_name, not result.error)
|
|
|
|
|
+ if asyncio.iscoroutine(ret):
|
|
|
|
|
+ await ret
|
|
|
else:
|
|
else:
|
|
|
status = "成功" if not result.error else f"失败: {result.error[:30]}"
|
|
status = "成功" if not result.error else f"失败: {result.error[:30]}"
|
|
|
- print(f" [{idx + 1}/{total}] {section.section_name}: {status}")
|
|
|
|
|
|
|
+ logger.debug(f"[{idx + 1}/{total}] {section.section_name}: {status}")
|
|
|
|
|
|
|
|
return result
|
|
return result
|
|
|
|
|
|
|
@@ -1890,7 +1891,7 @@ class LLMContentClassifier:
|
|
|
|
|
|
|
|
updated_chunks.append(updated_chunk)
|
|
updated_chunks.append(updated_chunk)
|
|
|
|
|
|
|
|
- print(f" 三级分类完成!共处理 {len(updated_chunks)} 个 chunks")
|
|
|
|
|
|
|
+ logger.info(f"三级分类完成!共处理 {len(updated_chunks)} 个 chunks")
|
|
|
return updated_chunks
|
|
return updated_chunks
|
|
|
|
|
|
|
|
|
|
|