chenkun 2 tygodni temu
rodzic
commit
34e896de51

+ 10 - 9
src/app/services/milvus_service.py

@@ -77,7 +77,8 @@ class MilvusService:
                 
                 parent_docs = []
                 child_docs = []
-                global_idx = 0
+                parent_global_idx = 0
+                child_global_idx = 0
                 
                 for split in md_header_splits:
                     # 获取预处理后的层级路径
@@ -86,21 +87,21 @@ class MilvusService:
                     # 对每个标题块进行父段切分
                     split_parent_chunks = parent_splitter.split_text(split.page_content)
                     
-                    for p_idx, p_content in enumerate(split_parent_chunks):
-                        p_id = hashlib.sha1(f"{doc_id}_p_{global_idx}_{p_idx}".encode()).hexdigest()
-                        p_metadata = self._prepare_metadata(doc_info, p_id, global_idx, p_id, hierarchy)
+                    for p_content in split_parent_chunks:
+                        p_id = hashlib.sha1(f"{doc_id}_p_{parent_global_idx}".encode()).hexdigest()
+                        p_metadata = self._prepare_metadata(doc_info, p_id, parent_global_idx, p_id, hierarchy)
                         parent_docs.append(Document(page_content=p_content, metadata=p_metadata))
+                        parent_global_idx += 1
                         
                         # 2. 在每个父段内部切分子段 (较小块)
                         child_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=30)
                         child_chunks = child_splitter.split_text(p_content)
                         
-                        for c_idx, c_content in enumerate(child_chunks):
-                            c_id = hashlib.sha1(f"{doc_id}_c_{p_id}_{c_idx}".encode()).hexdigest()
-                            c_metadata = self._prepare_metadata(doc_info, c_id, c_idx, p_id, hierarchy)
+                        for c_content in child_chunks:
+                            c_id = hashlib.sha1(f"{doc_id}_c_{child_global_idx}".encode()).hexdigest()
+                            c_metadata = self._prepare_metadata(doc_info, c_id, child_global_idx, p_id, hierarchy)
                             child_docs.append(Document(page_content=c_content, metadata=c_metadata))
-                        
-                        global_idx += 1
+                            child_global_idx += 1
 
                 # 确保两个集合都存在
                 self.ensure_collection_exists(parent_col)

+ 10 - 5
src/app/services/sample_service.py

@@ -1283,12 +1283,14 @@ class SampleService:
                 # 关联主表字段:file_url, conversion_status, md_url, json_url
                 fields = """
                     s.id, s.chinese_name as title, s.standard_number as standard_no, 
-                    s.issuing_authority, s.release_date, s.document_type, 
-                    s.professional_field, s.validity, s.note, 
+                    s.issuing_authority, s.release_date, s.implementation_date,
+                    s.drafting_unit, s.approving_department,
+                    s.document_type, s.professional_field, s.validity, s.note, 
                     s.participating_units, s.reference_basis,
                     s.created_by, u1.username as creator_name, s.created_time,
                     s.updated_by, u2.username as updater_name, s.updated_time,
-                    m.file_url, m.conversion_status, m.md_url, m.json_url, m.kb_id, m.whether_to_enter
+                    m.file_url, m.conversion_status, m.md_url, m.json_url, m.kb_id, m.whether_to_enter,
+                    m.source_type
                 """
                 field_map = {
                     'title': 's.chinese_name',
@@ -1305,6 +1307,7 @@ class SampleService:
                     s.id, s.plan_name as title, NULL as standard_no, 
                     s.project_name, s.project_section,
                     s.compiling_unit as issuing_authority, s.compiling_date as release_date, 
+                    NULL as implementation_date, NULL as drafting_unit, NULL as approving_department,
                     NULL as document_type, NULL as professional_field, NULL as validity, 
                     s.plan_summary, s.compilation_basis,
                     s.plan_category, s.level_1_classification, s.level_2_classification,
@@ -1312,7 +1315,8 @@ class SampleService:
                     s.note, 
                     s.created_by, u1.username as creator_name, s.created_time,
                     s.updated_by, u2.username as updater_name, s.updated_time,
-                    m.file_url, m.conversion_status, m.md_url, m.json_url, m.kb_id, m.whether_to_enter
+                    m.file_url, m.conversion_status, m.md_url, m.json_url, m.kb_id, m.whether_to_enter,
+                    m.source_type
                 """
                 field_map = {
                     'title': 's.plan_name',
@@ -1333,7 +1337,8 @@ class SampleService:
                     s.note, 
                     s.created_by, u1.username as creator_name, s.created_time,
                     s.updated_by, u2.username as updater_name, s.updated_time,
-                    m.file_url, m.conversion_status, m.md_url, m.json_url, m.kb_id, m.whether_to_enter
+                    m.file_url, m.conversion_status, m.md_url, m.json_url, m.kb_id, m.whether_to_enter,
+                    m.source_type
                 """
                 field_map = {
                     'title': 's.file_name',