chenkun 1 месяц назад
Родитель
Commit
93f9312efe
3 измененных файлов с 755 добавлено и 184 удалено
  1. 680 154
      full_server.py
  2. 3 1
      scripts/fix_db_indexes.py
  3. 72 29
      scripts/miner_u.py

+ 680 - 154
full_server.py

@@ -15,13 +15,53 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
 from dotenv import load_dotenv
 from dotenv import load_dotenv
 load_dotenv()
 load_dotenv()
 
 
-from fastapi import FastAPI, HTTPException, Depends, Request, Response
+from fastapi import FastAPI, HTTPException, Depends, Request, Response, BackgroundTasks
+from fastapi.responses import HTMLResponse, JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
 from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
 from pydantic import BaseModel
 from pydantic import BaseModel
 from typing import Optional, Any, Union
 from typing import Optional, Any, Union
 import hashlib
 import hashlib
 import secrets
 import secrets
+import requests
+from urllib.parse import urlparse
+
# Map of MIME types to file extensions; used as a fallback when the URL
# path itself does not reveal a usable extension.
MIME_MAP = {
    'application/pdf': '.pdf',
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
    'application/msword': '.doc',
    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
    'application/vnd.ms-excel': '.xls',
    'application/vnd.openxmlformats-officedocument.presentationml.presentation': '.pptx',
    'application/vnd.ms-powerpoint': '.ppt',
    'text/markdown': '.md',
    'text/plain': '.txt',
    'text/html': '.html',
    'image/jpeg': '.jpg',
    'image/png': '.png',
    'application/zip': '.zip',
}

def detect_file_extension(url: str) -> str:
    """Detect a file's extension from its URL.

    Tries to read the extension straight from the URL path first; if that
    fails, issues a HEAD request and maps the Content-Type header through
    MIME_MAP.

    Args:
        url: File URL to inspect; may be empty or None-ish (falsy).

    Returns:
        The extension including the leading dot (e.g. ".pdf"), or ""
        when it cannot be determined.
    """
    if not url:
        return ""

    # 1. Try to parse the extension from the URL path (query string is
    #    already stripped by urlparse).
    path = urlparse(url).path
    ext = os.path.splitext(path)[1].lower()
    # Reject implausibly long "extensions" (dots inside ordinary tokens).
    if ext and len(ext) <= 6:
        return ext

    # 2. Fall back to a HEAD request and inspect Content-Type.
    try:
        response = requests.head(url, allow_redirects=True, timeout=5)
        # Fix: MIME types are case-insensitive (RFC 2045) — normalize to
        # lowercase so e.g. "Application/PDF" still maps to ".pdf".
        content_type = response.headers.get('Content-Type', '').split(';')[0].strip().lower()
        return MIME_MAP.get(content_type, "")
    except Exception as e:
        # Best-effort: any network failure just means "extension unknown".
        print(f"检测文件后缀失败: {e}")
        return ""
 # 修复JWT导入 - 确保使用正确的JWT库
 # 修复JWT导入 - 确保使用正确的JWT库
 try:
 try:
     # 首先尝试使用PyJWT
     # 首先尝试使用PyJWT
@@ -95,6 +135,75 @@ TABLE_MAP = {
     "job": "t_job_of_preparation"      # 办公制度
     "job": "t_job_of_preparation"      # 办公制度
 }
 }
 
 
def get_db_connection():
    """Open a MySQL connection configured from the DATABASE_URL env var.

    Returns:
        A pymysql connection, or None when DATABASE_URL is unset/empty or
        the connection attempt fails (errors are printed, never raised).
    """
    try:
        database_url = os.getenv('DATABASE_URL', '')
        if not database_url:
            return None

        parsed = urlparse(database_url)
        config = {
            'host': parsed.hostname or 'localhost',
            'port': parsed.port or 3306,
            'user': parsed.username or 'root',
            'password': parsed.password or '',
            # parsed.path includes the leading '/', hence the slice.
            'database': parsed.path[1:] if parsed.path else 'sso_db',
            'charset': 'utf8mb4'
        }

        return pymysql.connect(**config)
    except Exception as e:
        print(f"数据库连接失败: {e}")
        return None

# --- Master-table bootstrap ---
def init_master_table():
    """Create the t_document_main master table if it does not exist.

    Best-effort bootstrap executed at import time: when no database is
    reachable it returns silently, and any DDL failure is printed and
    swallowed so the server can still start.
    """
    conn = get_db_connection()
    if not conn:
        return
    cursor = None
    try:
        cursor = conn.cursor()
        # Create the master table (no-op when it already exists).
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS t_document_main (
                id CHAR(36) PRIMARY KEY,
                title VARCHAR(255) NOT NULL,
                standard_no VARCHAR(100),
                issuing_authority VARCHAR(255),
                release_date DATE,
                document_type VARCHAR(100),
                professional_field VARCHAR(100),
                validity VARCHAR(50) DEFAULT '现行',
                created_by VARCHAR(100),
                created_time DATETIME DEFAULT CURRENT_TIMESTAMP,
                updated_time DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
                conversion_status TINYINT DEFAULT 0, -- 0:待转化, 1:转化中, 2:已完成, 3:失败
                conversion_progress INT DEFAULT 0,
                converted_file_name VARCHAR(255),
                conversion_error TEXT,
                whether_to_enter TINYINT DEFAULT 0, -- 0:未入库, 1:已入库
                source_type ENUM('basis', 'work', 'job') NOT NULL,
                source_id CHAR(36) NOT NULL,
                file_url TEXT,
                file_extension VARCHAR(10),
                content TEXT,
                primary_category_id INT,
                secondary_category_id INT,
                year INT
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
        """)
        conn.commit()
        print("✅ 主表 t_document_main 初始化成功")
    except Exception as e:
        print(f"❌ 初始化主表失败: {e}")
    finally:
        # Fix: close the cursor as well as the connection (it previously
        # leaked when the connection outlived this call).
        if cursor:
            cursor.close()
        conn.close()

# Run the bootstrap at import time.
init_master_table()
+
 def get_table_name(table_type: Optional[str]) -> str:
 def get_table_name(table_type: Optional[str]) -> str:
     """根据类型获取对应的数据库表名,默认为编制依据"""
     """根据类型获取对应的数据库表名,默认为编制依据"""
     return TABLE_MAP.get(table_type, "t_basis_of_preparation")
     return TABLE_MAP.get(table_type, "t_basis_of_preparation")
@@ -105,14 +214,29 @@ class DocumentAdd(BaseModel):
     primary_category_id: Optional[Any] = None
     primary_category_id: Optional[Any] = None
     secondary_category_id: Optional[Any] = None
     secondary_category_id: Optional[Any] = None
     year: Optional[int] = None
     year: Optional[int] = None
-    table_type: Optional[str] = "basis" # 增加表类型参数
+    table_type: Optional[str] = "basis"
+    # 新增编辑需要的字段
+    id: Optional[str] = None
+    source_id: Optional[str] = None
+    # 扩展字段 (子表特有属性)
+    standard_no: Optional[str] = None
+    issuing_authority: Optional[str] = None
+    release_date: Optional[str] = None
+    document_type: Optional[str] = None
+    professional_field: Optional[str] = None
+    validity: Optional[str] = None
+    project_name: Optional[str] = None
+    project_section: Optional[str] = None
+    # 文件相关字段
+    file_url: Optional[str] = None
+    file_extension: Optional[str] = None
 
 
 class DocumentListRequest(BaseModel):
 class DocumentListRequest(BaseModel):
-    primaryCategoryId: Optional[int] = None
-    secondaryCategoryId: Optional[int] = None
     page: int = 1
     page: int = 1
     size: int = 50
     size: int = 50
-    sort_by: str = "created_at"  # created_at or updated_at
+    keyword: Optional[str] = None
+    table_type: Optional[str] = None
+    whether_to_enter: Optional[int] = None
 
 
 # 配置
 # 配置
 JWT_SECRET_KEY = os.getenv("JWT_SECRET_KEY", "dev-jwt-secret-key-12345")
 JWT_SECRET_KEY = os.getenv("JWT_SECRET_KEY", "dev-jwt-secret-key-12345")
@@ -134,28 +258,6 @@ def find_available_port(start_port=8000, max_port=8010):
             return port
             return port
     return None
     return None
 
 
-def get_db_connection():
-    """获取数据库连接"""
-    try:
-        database_url = os.getenv('DATABASE_URL', '')
-        if not database_url:
-            return None
-            
-        parsed = urlparse(database_url)
-        config = {
-            'host': parsed.hostname or 'localhost',
-            'port': parsed.port or 3306,
-            'user': parsed.username or 'root',
-            'password': parsed.password or '',
-            'database': parsed.path[1:] if parsed.path else 'sso_db',
-            'charset': 'utf8mb4'
-        }
-        
-        return pymysql.connect(**config)
-    except Exception as e:
-        print(f"数据库连接失败: {e}")
-        return None
-
 def verify_password_simple(password: str, stored_hash: str) -> bool:
 def verify_password_simple(password: str, stored_hash: str) -> bool:
     """验证密码(简化版)"""
     """验证密码(简化版)"""
     if stored_hash.startswith("sha256$"):
     if stored_hash.startswith("sha256$"):
@@ -3553,16 +3655,16 @@ import httpx
 from fastapi.responses import HTMLResponse
 from fastapi.responses import HTMLResponse
 
 
 class BatchEnterRequest(BaseModel):
 class BatchEnterRequest(BaseModel):
-    ids: list[int]
-    table_type: Optional[str] = "basis"
+    ids: list[Union[int, str]]
+    table_type: Optional[str] = None
 
 
 class BatchDeleteRequest(BaseModel):
 class BatchDeleteRequest(BaseModel):
     ids: list[Union[int, str]]
     ids: list[Union[int, str]]
-    table_type: Optional[str] = "basis"
+    table_type: Optional[str] = None
 
 
 class ConvertRequest(BaseModel):
 class ConvertRequest(BaseModel):
     id: Union[int, str]
     id: Union[int, str]
-    table_type: Optional[str] = "basis"
+    table_type: Optional[str] = None
 
 
 # --- 文档管理中心 API ---
 # --- 文档管理中心 API ---
 
 
@@ -3640,24 +3742,53 @@ async def batch_enter_knowledge_base(req: BatchEnterRequest, credentials: HTTPAu
             return ApiResponse(code=500, message="数据库连接失败", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
             return ApiResponse(code=500, message="数据库连接失败", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
         
         
         cursor = conn.cursor()
         cursor = conn.cursor()
-        table_name = get_table_name(req.table_type)
-        # 批量更新 whether_to_enter 为 1
-        # 只更新尚未入库的数据 (whether_to_enter = 0)
+        
+        # 1. 批量更新主表 t_document_main
+        # 只更新尚未入库的数据 (whether_to_enter = 0),同时更新修改时间和修改人(如果需要)
         placeholders = ', '.join(['%s'] * len(req.ids))
         placeholders = ', '.join(['%s'] * len(req.ids))
-        sql = f"UPDATE {table_name} SET whether_to_enter = 1, updated_at = NOW() WHERE id IN ({placeholders}) AND whether_to_enter = 0"
-        cursor.execute(sql, req.ids)
-        conn.commit()
+        username = payload.get("username", "admin")
         
         
+        # 首先更新主表
+        sql_main = f"""
+            UPDATE t_document_main 
+            SET whether_to_enter = 1, updated_time = NOW() 
+            WHERE id IN ({placeholders}) AND whether_to_enter = 0
+        """
+        cursor.execute(sql_main, req.ids)
         affected_rows = cursor.rowcount
         affected_rows = cursor.rowcount
+        
+        # 2. 尝试更新对应的子表以保持同步
+        try:
+            # 查询这些 ID 对应的 source_type 和 source_id
+            cursor.execute(f"SELECT id, source_type, source_id FROM t_document_main WHERE id IN ({placeholders})", req.ids)
+            docs = cursor.fetchall()
+            
+            for doc_row in docs:
+                d_id, s_type, s_id = doc_row
+                if s_type and s_id:
+                    sub_table = get_table_name(s_type)
+                    if sub_table:
+                        # 更新子表中的 whether_to_enter 字段(如果存在)
+                        # 注意:子表中的主键可能是 id 且值为 s_id
+                        sub_sql = f"UPDATE {sub_table} SET whether_to_enter = 1, updated_at = NOW(), updated_by = %s WHERE id = %s"
+                        try:
+                            cursor.execute(sub_sql, (username, s_id))
+                        except Exception as sub_e:
+                            print(f"更新子表 {sub_table} 失败 (可能字段不存在): {sub_e}")
+        except Exception as sync_e:
+            print(f"同步更新子表失败: {sync_e}")
+
+        conn.commit()
         cursor.close()
         cursor.close()
         conn.close()
         conn.close()
         
         
         message = f"成功将 {affected_rows} 条数据加入知识库"
         message = f"成功将 {affected_rows} 条数据加入知识库"
         if affected_rows < len(req.ids):
         if affected_rows < len(req.ids):
-            message += f"(跳过了 {len(req.ids) - affected_rows} 条已入库数据)"
+            message += f"(跳过了 {len(req.ids) - affected_rows} 条已入库数据或未找到数据)"
             
             
         return ApiResponse(code=0, message=message, timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
         return ApiResponse(code=0, message=message, timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
     except Exception as e:
     except Exception as e:
+        print(f"批量操作失败: {e}")
         return ApiResponse(code=500, message=f"批量操作失败: {str(e)}", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
         return ApiResponse(code=500, message=f"批量操作失败: {str(e)}", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
 
 
 @app.post("/api/v1/documents/batch-delete")
 @app.post("/api/v1/documents/batch-delete")
@@ -3675,18 +3806,39 @@ async def batch_delete_documents(req: BatchDeleteRequest, credentials: HTTPAutho
             return ApiResponse(code=500, message="数据库连接失败", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
             return ApiResponse(code=500, message="数据库连接失败", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
         
         
         cursor = conn.cursor()
         cursor = conn.cursor()
-        table_name = get_table_name(req.table_type)
         
         
         if not req.ids:
         if not req.ids:
             return ApiResponse(code=400, message="未指定要删除的文档 ID", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
             return ApiResponse(code=400, message="未指定要删除的文档 ID", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
             
             
         placeholders = ', '.join(['%s'] * len(req.ids))
         placeholders = ', '.join(['%s'] * len(req.ids))
-        sql = f"DELETE FROM {table_name} WHERE id IN ({placeholders})"
-        cursor.execute(sql, req.ids)
-        conn.commit()
         
         
+        # 1. 尝试同步删除子表中的数据
+        try:
+            # 查询这些 ID 对应的 source_type 和 source_id
+            cursor.execute(f"SELECT source_type, source_id FROM t_document_main WHERE id IN ({placeholders})", req.ids)
+            docs = cursor.fetchall()
+            
+            for doc_row in docs:
+                s_type, s_id = doc_row
+                if s_type and s_id:
+                    sub_table = get_table_name(s_type)
+                    if sub_table:
+                        # 删除子表数据
+                        sub_sql = f"DELETE FROM {sub_table} WHERE id = %s"
+                        try:
+                            cursor.execute(sub_sql, (s_id,))
+                        except Exception as sub_e:
+                            print(f"删除子表 {sub_table} 数据失败: {sub_e}")
+        except Exception as sync_e:
+            print(f"同步删除子表数据失败: {sync_e}")
+
+        # 2. 删除主表 t_document_main 中的数据
+        sql_main = f"DELETE FROM t_document_main WHERE id IN ({placeholders})"
+        cursor.execute(sql_main, req.ids)
         affected_rows = cursor.rowcount
         affected_rows = cursor.rowcount
         
         
+        conn.commit()
+        
         return ApiResponse(
         return ApiResponse(
             code=0, 
             code=0, 
             message=f"成功删除 {affected_rows} 条文档数据", 
             message=f"成功删除 {affected_rows} 条文档数据", 
@@ -3701,30 +3853,90 @@ async def batch_delete_documents(req: BatchDeleteRequest, credentials: HTTPAutho
         if conn:
         if conn:
             conn.close()
             conn.close()
 
 
async def simulate_conversion(doc_id: str):
    """Simulate a document conversion by stepping conversion_progress.

    Walks t_document_main through 10% -> 40% -> 75% -> done (status 2,
    progress 100, converted_file_name set), pausing between steps. On any
    error the row is marked status 3 with the error message recorded.

    Args:
        doc_id: Primary key of the t_document_main row to update.
    """
    import asyncio
    conn = None
    try:
        conn = get_db_connection()
        cursor = conn.cursor()

        # 1. Start (10%)
        cursor.execute("UPDATE t_document_main SET conversion_status = 1, conversion_progress = 10 WHERE id = %s", (doc_id,))
        conn.commit()
        # Fix: this coroutine runs as a FastAPI background task on the
        # event loop — time.sleep() would block the whole server, so use
        # the non-blocking asyncio.sleep() instead.
        await asyncio.sleep(2)

        # 2. In progress (40%)
        cursor.execute("UPDATE t_document_main SET conversion_progress = 40 WHERE id = %s", (doc_id,))
        conn.commit()
        await asyncio.sleep(3)

        # 3. In progress (75%)
        cursor.execute("UPDATE t_document_main SET conversion_progress = 75 WHERE id = %s", (doc_id,))
        conn.commit()
        await asyncio.sleep(2)

        # 4. Done (100%)
        cursor.execute("""
            UPDATE t_document_main 
            SET conversion_status = 2, conversion_progress = 100, 
                converted_file_name = CONCAT(title, '_已转换.pdf') 
            WHERE id = %s
        """, (doc_id,))
        conn.commit()

    except Exception as e:
        print(f"模拟转换出错: {e}")
        # Best-effort: record the failure on the row (status 3 = failed).
        if conn:
            cursor = conn.cursor()
            cursor.execute("UPDATE t_document_main SET conversion_status = 3, conversion_error = %s WHERE id = %s", (str(e), doc_id))
            conn.commit()
    finally:
        if conn:
            conn.close()
+
 @app.post("/api/v1/documents/convert")
 @app.post("/api/v1/documents/convert")
-async def convert_document(req: ConvertRequest, credentials: HTTPAuthorizationCredentials = Depends(security)):
-    """异步启动文档转换"""
-    import subprocess
+async def convert_document(req: ConvertRequest, background_tasks: BackgroundTasks, credentials: HTTPAuthorizationCredentials = Depends(security)):
+    """启动文档转换 (支持真实脚本与模拟逻辑)"""
     try:
     try:
         payload = verify_token(credentials.credentials)
         payload = verify_token(credentials.credentials)
         if not payload or not payload.get("is_superuser"):
         if not payload or not payload.get("is_superuser"):
             return ApiResponse(code=403, message="权限不足", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
             return ApiResponse(code=403, message="权限不足", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
         
         
-        # 启动后台进程执行转换
-        # 脚本位于 d:\UGit\LQAdminPlatform\scripts\miner_u.py
+        table_type = req.table_type
+        # 如果没有提供 table_type,从主表查询
+        if not table_type:
+            try:
+                conn = get_db_connection()
+                if conn:
+                    cursor = conn.cursor()
+                    cursor.execute("SELECT source_type FROM t_document_main WHERE id = %s", (req.id,))
+                    res = cursor.fetchone()
+                    if res:
+                        table_type = res[0]
+                    cursor.close()
+                    conn.close()
+            except Exception as e:
+                print(f"从主表获取 source_type 失败: {e}")
+
+        # 1. 优先尝试启动真实转换脚本
         script_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "scripts", "miner_u.py"))
         script_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "scripts", "miner_u.py"))
-        # 使用当前 python 解释器
-        python_exe = sys.executable
-        
-        # 异步启动,不等待结束
-        subprocess.Popen([python_exe, script_path, str(req.table_type), str(req.id)], 
-                         stdout=subprocess.DEVNULL, 
-                         stderr=subprocess.DEVNULL,
-                         creationflags=subprocess.CREATE_NO_WINDOW if os.name == 'nt' else 0)
+        if os.path.exists(script_path):
+            import subprocess
+            python_exe = sys.executable
+            # 传递 table_type 和 id 给脚本
+            subprocess.Popen([python_exe, script_path, str(table_type or "basis"), str(req.id)], 
+                             stdout=subprocess.DEVNULL, 
+                             stderr=subprocess.DEVNULL,
+                             creationflags=subprocess.CREATE_NO_WINDOW if os.name == 'nt' else 0)
+            return ApiResponse(code=0, message="转换任务已在后台启动", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
+        
+        # 2. 如果脚本不存在,则启动模拟转换逻辑
+        background_tasks.add_task(simulate_conversion, str(req.id))
         
         
         return ApiResponse(
         return ApiResponse(
             code=0, 
             code=0, 
-            message="转换任务已启动", 
+            message="转换任务已启动 (模拟模式)", 
             timestamp=datetime.now(timezone.utc).isoformat()
             timestamp=datetime.now(timezone.utc).isoformat()
         ).model_dump()
         ).model_dump()
     except Exception as e:
     except Exception as e:
@@ -3733,155 +3945,469 @@ async def convert_document(req: ConvertRequest, credentials: HTTPAuthorizationCr
 
 
 @app.post("/api/v1/documents/add")
 @app.post("/api/v1/documents/add")
 async def add_document(doc: DocumentAdd, credentials: HTTPAuthorizationCredentials = Depends(security)):
 async def add_document(doc: DocumentAdd, credentials: HTTPAuthorizationCredentials = Depends(security)):
-    """添加新文档"""
+    """添加新文档 (同步主表和子表)"""
     try:
     try:
         payload = verify_token(credentials.credentials)
         payload = verify_token(credentials.credentials)
-        if not payload or not payload.get("is_superuser"):
-            return ApiResponse(code=403, message="权限不足", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
+        if not payload:
+            return ApiResponse(code=401, message="无效的访问令牌", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
             
             
+        user_id = payload.get("username", "admin")
         conn = get_db_connection()
         conn = get_db_connection()
         if not conn:
         if not conn:
             return ApiResponse(code=500, message="数据库连接失败", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
             return ApiResponse(code=500, message="数据库连接失败", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
-        
+            
         cursor = conn.cursor()
         cursor = conn.cursor()
-        table_name = get_table_name(doc.table_type)
-        # 修正列名:reference_basis -> reference_basis_list
-        sql = f"""
-            INSERT INTO {table_name} 
-            (chinese_name, reference_basis_list, document_type, professional_field, release_date, created_at, updated_at)
-            VALUES (%s, %s, %s, %s, %s, NOW(), NOW())
-        """
-        # 构造日期:如果是年份,转为 YYYY-01-01
-        release_date = f"{doc.year}-01-01" if doc.year else None
-        
-        cursor.execute(sql, (doc.title, doc.content, str(doc.primary_category_id) if doc.primary_category_id else None, 
-                             str(doc.secondary_category_id) if doc.secondary_category_id else None, release_date))
-        conn.commit()
-        cursor.close()
-        conn.close()
+        doc_id = str(uuid.uuid4())
+        source_id = str(uuid.uuid4())
+        table_name = TABLE_MAP.get(doc.table_type, "t_basis_of_preparation")
         
         
-        return ApiResponse(code=0, message="文档添加成功", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
+        try:
+            # 1. 插入子表
+            if doc.table_type == 'basis':
+                cursor.execute(
+                    f"INSERT INTO {table_name} (id, chinese_name, created_by) VALUES (%s, %s, %s)",
+                    (source_id, doc.title, user_id)
+                )
+            elif doc.table_type == 'work':
+                cursor.execute(
+                    f"INSERT INTO {table_name} (id, plan_name, created_by) VALUES (%s, %s, %s)",
+                    (source_id, doc.title, user_id)
+                )
+            elif doc.table_type == 'job':
+                cursor.execute(
+                    f"INSERT INTO {table_name} (id, file_name, created_by) VALUES (%s, %s, %s)",
+                    (source_id, doc.title, user_id)
+                )
+                
+            # 2. 插入主表
+            cursor.execute("""
+                INSERT INTO t_document_main 
+                (id, title, content, created_by, source_type, source_id, whether_to_enter, primary_category_id, secondary_category_id, year, file_url, file_extension) 
+                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
+            """, (doc_id, doc.title, doc.content, user_id, doc.table_type, source_id, 0, doc.primary_category_id, doc.secondary_category_id, doc.year, doc.file_url, doc.file_extension))
+            
+            conn.commit()
+            return ApiResponse(code=0, message="文档添加成功", data={"id": doc_id}, timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
+        except Exception as e:
+            conn.rollback()
+            raise e
+        finally:
+            cursor.close()
+            conn.close()
     except Exception as e:
     except Exception as e:
-        print(f"添加文档错误: {e}")
-        return ApiResponse(code=500, message=f"服务器内部错误: {str(e)}", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
+        print(f"添加文档失败: {e}")
+        return ApiResponse(code=500, message=str(e), timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
+
@app.get("/api/v1/documents/detail/{doc_id}")
async def get_document_detail(doc_id: str, credentials: HTTPAuthorizationCredentials = Depends(security)):
    """Fetch one document's detail from t_document_main, merged with its sub-table row.

    Reads the master row by ``doc_id``, then follows its ``source_type`` /
    ``source_id`` to the type-specific table (via TABLE_MAP) and copies
    selected sub-table columns onto the response under generic field names
    so the frontend can render every document type uniformly.

    Returns an ApiResponse dict: code 0 with the document data on success,
    401 for a bad token, 404 for an unknown id, 500 on DB errors.
    """
    print(f"🔍 正在获取文档详情: {doc_id}")
    try:
        payload = verify_token(credentials.credentials)
        if not payload:
            return ApiResponse(code=401, message="无效的访问令牌", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
            
        conn = get_db_connection()
        if not conn:
            return ApiResponse(code=500, message="数据库连接失败", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
            
        cursor = conn.cursor()
        try:
            # 1. Read the master row.
            cursor.execute("SELECT * FROM t_document_main WHERE id = %s", (doc_id,))
            main_row = cursor.fetchone()
            if not main_row:
                print(f"❌ 文档不存在: {doc_id}")
                return ApiResponse(code=404, message="文档不存在", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
                
            columns = [desc[0] for desc in cursor.description]
            doc = dict(zip(columns, main_row))
            print(f"✅ 找到主表数据: {doc.get('title')}")
            
            # 2. Read the sub-table row referenced by source_type/source_id.
            source_type = doc.get('source_type')
            source_id = doc.get('source_id')
            table_name = TABLE_MAP.get(source_type)
            
            if table_name and source_id:
                cursor.execute(f"SELECT * FROM {table_name} WHERE id = %s", (source_id,))
                sub_row = cursor.fetchone()
                if sub_row:
                    sub_columns = [desc[0] for desc in cursor.description]
                    sub_data = dict(zip(sub_columns, sub_row))
                    
                    # Map sub-table columns onto generic field names; each
                    # source type stores equivalent data under different
                    # column names (e.g. compiling_unit vs issuing_department).
                    if source_type == 'basis':
                        doc['standard_no'] = sub_data.get('standard_number')
                        doc['issuing_authority'] = sub_data.get('issuing_authority')
                        doc['release_date'] = str(sub_data.get('release_date')) if sub_data.get('release_date') else None
                        doc['document_type'] = sub_data.get('document_type')
                        doc['professional_field'] = sub_data.get('professional_field')
                        doc['validity'] = sub_data.get('validity')
                    elif source_type == 'work':
                        doc['project_name'] = sub_data.get('project_name')
                        doc['project_section'] = sub_data.get('project_section')
                        doc['issuing_authority'] = sub_data.get('compiling_unit')
                        doc['release_date'] = str(sub_data.get('compiling_date')) if sub_data.get('compiling_date') else None
                    elif source_type == 'job':
                        doc['issuing_authority'] = sub_data.get('issuing_department')
                        doc['document_type'] = sub_data.get('document_type')
                        doc['release_date'] = str(sub_data.get('publish_date')) if sub_data.get('publish_date') else None
            
            # Serialize datetime/date values so the response is JSON-safe.
            if doc.get('created_time'):
                doc['created_time'] = doc['created_time'].isoformat()
            if doc.get('updated_time'):
                doc['updated_time'] = doc['updated_time'].isoformat()
            if doc.get('release_date') and not isinstance(doc['release_date'], str):
                doc['release_date'] = doc['release_date'].isoformat()

            return ApiResponse(code=0, message="获取详情成功", data=doc, timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
        finally:
            cursor.close()
            conn.close()
    except Exception as e:
        print(f"获取文档详情失败: {e}")
        return ApiResponse(code=500, message=str(e), timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
 
 
 @app.get("/api/v1/documents/list")
 @app.get("/api/v1/documents/list")
 async def get_document_list(
 async def get_document_list(
-    primaryCategoryId: Optional[str] = None,
-    secondaryCategoryId: Optional[str] = None,
-    year: Optional[int] = None,
     whether_to_enter: Optional[int] = None,
     whether_to_enter: Optional[int] = None,
     keyword: Optional[str] = None,
     keyword: Optional[str] = None,
-    table_type: Optional[str] = "basis",
+    table_type: Optional[str] = None,
     page: int = 1, 
     page: int = 1, 
     size: int = 50,
     size: int = 50,
-    sort_by: str = "created_at",
     credentials: HTTPAuthorizationCredentials = Depends(security)
     credentials: HTTPAuthorizationCredentials = Depends(security)
 ):
 ):
-    """获取文档列表(支持过滤与搜索)"""
-    conn = None
-    cursor = None
+    """获取文档列表 (从主表查询)"""
     try:
     try:
         payload = verify_token(credentials.credentials)
         payload = verify_token(credentials.credentials)
-        if not payload or not payload.get("is_superuser"):
-            return ApiResponse(code=403, message="权限不足", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
+        if not payload:
+            return ApiResponse(code=401, message="无效的访问令牌", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
             
             
         conn = get_db_connection()
         conn = get_db_connection()
         if not conn:
         if not conn:
             return ApiResponse(code=500, message="数据库连接失败", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
             return ApiResponse(code=500, message="数据库连接失败", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
-        
+            
         cursor = conn.cursor()
         cursor = conn.cursor()
-        table_name = get_table_name(table_type)
-        
         where_clauses = []
         where_clauses = []
         params = []
         params = []
         
         
-        if primaryCategoryId:
-            where_clauses.append("document_type = %s")
-            params.append(primaryCategoryId)
-        if secondaryCategoryId:
-            where_clauses.append("professional_field = %s")
-            params.append(secondaryCategoryId)
-        if year:
-            where_clauses.append("YEAR(release_date) = %s")
-            params.append(year)
+        if table_type:
+            where_clauses.append("source_type = %s")
+            params.append(table_type)
         if whether_to_enter is not None:
         if whether_to_enter is not None:
-            where_clauses.append("CAST(whether_to_enter AS UNSIGNED) = %s")
+            where_clauses.append("whether_to_enter = %s")
             params.append(whether_to_enter)
             params.append(whether_to_enter)
         if keyword:
         if keyword:
-            where_clauses.append("(chinese_name LIKE %s OR reference_basis_list LIKE %s OR standard_no LIKE %s)")
-            like_keyword = f"%{keyword}%"
-            params.extend([like_keyword, like_keyword, like_keyword])
+            where_clauses.append("(title LIKE %s OR content LIKE %s)")
+            params.extend([f"%{keyword}%", f"%{keyword}%"])
             
             
-        where_stmt = " WHERE " + " AND ".join(where_clauses) if where_clauses else ""
-        
-        # 排序逻辑:按创建时间倒序
-        sort_field = "created_at" if sort_by == "created_at" else "updated_at"
-        order_by = f"ORDER BY {sort_field} DESC"
-        
-        # 分页
+        where_sql = " WHERE " + " AND ".join(where_clauses) if where_clauses else ""
         offset = (page - 1) * size
         offset = (page - 1) * size
         
         
-        # 返回更多字段
-        sql = f"""
-            SELECT id, chinese_name as title, reference_basis_list as content, 
-            document_type, professional_field, 
-            YEAR(release_date) as year, release_date, standard_no, status, 
-            CAST(whether_to_enter AS UNSIGNED) as whether_to_enter, file_url,
-            conversion_status, conversion_progress, conversion_error,
-            created_at, updated_at 
-            FROM {table_name} {where_stmt} 
-            {order_by} LIMIT %s OFFSET %s
-        """
+        sql = f"SELECT * FROM t_document_main {where_sql} ORDER BY created_time DESC LIMIT %s OFFSET %s"
         params.extend([size, offset])
         params.extend([size, offset])
         
         
-        cursor.execute(sql, params)
-        columns = [col[0] for col in cursor.description]
-        items = [dict(zip(columns, row)) for row in cursor.fetchall()]
-        
-        # 格式化时间
-        for item in items:
-            for key, value in item.items():
-                if isinstance(value, (datetime, date)):
-                    item[key] = value.isoformat()
-        
-        # 获取总数
-        count_sql = f"SELECT COUNT(*) FROM {table_name} {where_stmt}"
-        cursor.execute(count_sql, params[:-2])
+        cursor.execute(sql, tuple(params))
+        columns = [desc[0] for desc in cursor.description]
+        items = []
+        for row in cursor.fetchall():
+            item = dict(zip(columns, row))
+            # 格式化时间
+            for key in ['created_time', 'updated_time', 'release_date']:
+                if item.get(key) and hasattr(item[key], 'isoformat'):
+                    item[key] = item[key].isoformat()
+            items.append(item)
+            
+        # 总数
+        count_sql = f"SELECT COUNT(*) FROM t_document_main {where_sql}"
+        cursor.execute(count_sql, tuple(params[:-2]))
         total = cursor.fetchone()[0]
         total = cursor.fetchone()[0]
         
         
-        # 优化统计查询:合并全局总数和已入库总数的查询,减少数据库交互
-        stats_sql = f"SELECT COUNT(*), SUM(CASE WHEN CAST(whether_to_enter AS UNSIGNED) = 1 THEN 1 ELSE 0 END) FROM {table_name}"
-        cursor.execute(stats_sql)
-        stats_result = cursor.fetchone()
+        # 统计数据
+        cursor.execute("SELECT COUNT(*) FROM t_document_main")
+        all_total = cursor.fetchone()[0]
+        cursor.execute("SELECT COUNT(*) FROM t_document_main WHERE whether_to_enter = 1")
+        total_entered = cursor.fetchone()[0]
         
         
-        all_total = 0
-        total_entered = 0
-        if stats_result:
-            all_total = stats_result[0] or 0
-            total_entered = int(stats_result[1] or 0)
+        cursor.close()
+        conn.close()
         
         
         return ApiResponse(
         return ApiResponse(
-            code=0,
-            message="获取成功",
+            code=0, 
+            message="查询成功", 
             data={
             data={
                 "items": items, 
                 "items": items, 
                 "total": total, 
                 "total": total, 
-                "all_total": all_total,
-                "total_entered": total_entered,
                 "page": page, 
                 "page": page, 
-                "size": size
+                "size": size,
+                "all_total": all_total,
+                "total_entered": total_entered
             },
             },
             timestamp=datetime.now(timezone.utc).isoformat()
             timestamp=datetime.now(timezone.utc).isoformat()
         ).model_dump()
         ).model_dump()
     except Exception as e:
     except Exception as e:
-        print(f"获取文档列表错误: {e}")
-        return ApiResponse(code=500, message=f"服务器内部错误: {str(e)}", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
-    finally:
-        if cursor:
+        print(f"获取文档列表失败: {e}")
+        return ApiResponse(code=500, message=str(e), timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
+
+@app.post("/api/v1/documents/edit")
+async def edit_document(doc: DocumentAdd, credentials: HTTPAuthorizationCredentials = Depends(security)):
+    """编辑文档 (同步主表和子表)"""
+    try:
+        payload = verify_token(credentials.credentials)
+        if not payload:
+            return ApiResponse(code=401, message="无效的访问令牌", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
+            
+        if not doc.id or not doc.source_id:
+            return ApiResponse(code=400, message="缺少ID参数", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
+            
+        conn = get_db_connection()
+        if not conn:
+            return ApiResponse(code=500, message="数据库连接失败", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
+            
+        cursor = conn.cursor()
+        table_name = TABLE_MAP.get(doc.table_type, "t_basis_of_preparation")
+        
+        try:
+            # 1. 更新子表内容
+            if doc.table_type == 'basis':
+                cursor.execute(f"""
+                    UPDATE {table_name} 
+                    SET chinese_name = %s, standard_number = %s, issuing_authority = %s, 
+                        release_date = %s, document_type = %s, professional_field = %s, validity = %s
+                    WHERE id = %s
+                """, (doc.title, doc.standard_no, doc.issuing_authority, doc.release_date, 
+                      doc.document_type, doc.professional_field, doc.validity, doc.source_id))
+            elif doc.table_type == 'work':
+                cursor.execute(f"""
+                    UPDATE {table_name} 
+                    SET plan_name = %s, project_name = %s, project_section = %s, 
+                        compiling_unit = %s, compiling_date = %s
+                    WHERE id = %s
+                """, (doc.title, doc.project_name, doc.project_section, doc.issuing_authority, 
+                      doc.release_date, doc.source_id))
+            elif doc.table_type == 'job':
+                cursor.execute(f"""
+                    UPDATE {table_name} 
+                    SET file_name = %s, issuing_department = %s, document_type = %s, publish_date = %s
+                    WHERE id = %s
+                """, (doc.title, doc.issuing_authority, doc.document_type, doc.release_date, doc.source_id))
+                
+            # 2. 更新主表内容
+            cursor.execute("""
+                UPDATE t_document_main 
+                SET title = %s, content = %s, updated_time = NOW(),
+                    primary_category_id = %s, secondary_category_id = %s, year = %s,
+                    file_url = %s, file_extension = %s
+                WHERE id = %s
+            """, (doc.title, doc.content, doc.primary_category_id, doc.secondary_category_id, doc.year, 
+                  doc.file_url, doc.file_extension, doc.id))
+            
+            conn.commit()
+            return ApiResponse(code=0, message="文档更新成功", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
+        except Exception as e:
+            conn.rollback()
+            raise e
+        finally:
             cursor.close()
             cursor.close()
-        if conn:
             conn.close()
             conn.close()
+    except Exception as e:
+        print(f"编辑文档失败: {e}")
+        return ApiResponse(code=500, message=str(e), timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
+
+@app.post("/api/v1/documents/enter")
+async def enter_document(data: dict, credentials: HTTPAuthorizationCredentials = Depends(security)):
+    """文档入库"""
+    try:
+        doc_id = data.get("id")
+        if not doc_id:
+            return ApiResponse(code=400, message="缺少ID", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
+            
+        payload = verify_token(credentials.credentials)
+        username = payload.get("username", "admin") if payload else "admin"
+        
+        conn = get_db_connection()
+        if not conn:
+            return ApiResponse(code=500, message="数据库连接失败", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
+            
+        cursor = conn.cursor()
+        
+        # 1. 更新主表
+        cursor.execute("UPDATE t_document_main SET whether_to_enter = 1, updated_time = NOW() WHERE id = %s", (doc_id,))
+        
+        # 2. 尝试同步更新子表
+        try:
+            cursor.execute("SELECT source_type, source_id FROM t_document_main WHERE id = %s", (doc_id,))
+            res = cursor.fetchone()
+            if res and res[0] and res[1]:
+                s_type, s_id = res
+                sub_table = get_table_name(s_type)
+                if sub_table:
+                    sub_sql = f"UPDATE {sub_table} SET whether_to_enter = 1, updated_at = NOW(), updated_by = %s WHERE id = %s"
+                    try:
+                        cursor.execute(sub_sql, (username, s_id))
+                    except Exception as sub_e:
+                        print(f"入库同步子表 {sub_table} 失败: {sub_e}")
+        except Exception as sync_e:
+            print(f"入库同步子表异常: {sync_e}")
+            
+        conn.commit()
+        cursor.close()
+        conn.close()
+        return ApiResponse(code=0, message="入库成功", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
+    except Exception as e:
+        print(f"入库失败: {e}")
+        return ApiResponse(code=500, message=str(e), timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
+
+@app.get("/api/v1/basic-info/list")
+async def get_basic_info_list(
+    type: str,
+    page: int = 1,
+    size: int = 50,
+    keyword: Optional[str] = None,
+    title: Optional[str] = None,
+    standard_no: Optional[str] = None,
+    document_type: Optional[str] = None,
+    professional_field: Optional[str] = None,
+    validity: Optional[str] = None,
+    issuing_authority: Optional[str] = None,
+    release_date_start: Optional[str] = None,
+    release_date_end: Optional[str] = None,
+    credentials: HTTPAuthorizationCredentials = Depends(security)
+):
+    """获取基本信息列表 (支持多条件检索)"""
+    try:
+        payload = verify_token(credentials.credentials)
+        if not payload:
+            return ApiResponse(code=401, message="无效的访问令牌", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
+            
+        conn = get_db_connection()
+        if not conn:
+            return ApiResponse(code=500, message="数据库连接失败", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
+        
+        cursor = conn.cursor()
+        
+        # 根据类型选择表名和字段映射
+        if type == 'basis':
+            table_name = "t_basis_of_preparation"
+            fields = "id, chinese_name as title, standard_number as standard_no, issuing_authority, release_date, document_type, professional_field, validity, created_by, created_time as created_at"
+            # 字段名映射供过滤使用
+            field_map = {
+                'title': 'chinese_name',
+                'standard_no': 'standard_number',
+                'issuing_authority': 'issuing_authority',
+                'release_date': 'release_date',
+                'document_type': 'document_type',
+                'professional_field': 'professional_field',
+                'validity': 'validity'
+            }
+        elif type == 'work':
+            table_name = "t_work_of_preparation"
+            fields = "id, plan_name as title, NULL as standard_no, compiling_unit as issuing_authority, compiling_date as release_date, NULL as document_type, NULL as professional_field, NULL as validity, created_by, created_time as created_at"
+            field_map = {
+                'title': 'plan_name',
+                'issuing_authority': 'compiling_unit',
+                'release_date': 'compiling_date'
+            }
+        elif type == 'job':
+            table_name = "t_job_of_preparation"
+            fields = "id, file_name as title, NULL as standard_no, issuing_department as issuing_authority, publish_date as release_date, document_type, NULL as professional_field, NULL as validity, created_by, created_time as created_at"
+            field_map = {
+                'title': 'file_name',
+                'issuing_authority': 'issuing_department',
+                'release_date': 'publish_date',
+                'document_type': 'document_type'
+            }
+        else:
+            return ApiResponse(code=400, message="无效的类型", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
+            
+        where_clauses = []
+        params = []
+        
+        # 1. 统一关键字搜索 (保持兼容)
+        if keyword:
+            if type == 'basis':
+                where_clauses.append("(chinese_name LIKE %s OR standard_number LIKE %s)")
+                params.extend([f"%{keyword}%", f"%{keyword}%"])
+            elif type == 'work':
+                where_clauses.append("plan_name LIKE %s")
+                params.append(f"%{keyword}%")
+            elif type == 'job':
+                where_clauses.append("file_name LIKE %s")
+                params.append(f"%{keyword}%")
+
+        # 2. 精细化检索
+        if title and 'title' in field_map:
+            where_clauses.append(f"{field_map['title']} LIKE %s")
+            params.append(f"%{title}%")
+        
+        if standard_no and 'standard_no' in field_map:
+            where_clauses.append(f"{field_map['standard_no']} LIKE %s")
+            params.append(f"%{standard_no}%")
+            
+        if document_type and 'document_type' in field_map:
+            where_clauses.append(f"{field_map['document_type']} = %s")
+            params.append(document_type)
+            
+        if professional_field and 'professional_field' in field_map:
+            where_clauses.append(f"{field_map['professional_field']} = %s")
+            params.append(professional_field)
+            
+        if validity and 'validity' in field_map:
+            where_clauses.append(f"{field_map['validity']} = %s")
+            params.append(validity)
+            
+        if issuing_authority and 'issuing_authority' in field_map:
+            where_clauses.append(f"{field_map['issuing_authority']} LIKE %s")
+            params.append(f"%{issuing_authority}%")
+            
+        if release_date_start and 'release_date' in field_map:
+            where_clauses.append(f"{field_map['release_date']} >= %s")
+            params.append(release_date_start)
+            
+        if release_date_end and 'release_date' in field_map:
+            where_clauses.append(f"{field_map['release_date']} <= %s")
+            params.append(release_date_end)
+                
+        where_sql = " WHERE " + " AND ".join(where_clauses) if where_clauses else ""
+        
+        # 分页查询
+        offset = (page - 1) * size
+        sql = f"SELECT {fields} FROM {table_name}{where_sql} ORDER BY created_at DESC LIMIT %s OFFSET %s"
+        params.extend([size, offset])
+        
+        cursor.execute(sql, tuple(params))
+        columns = [desc[0] for desc in cursor.description]
+        items = []
+        for row in cursor.fetchall():
+            item = dict(zip(columns, row))
+            # 格式化日期
+            for key in ['release_date', 'created_at']:
+                if item.get(key) and hasattr(item[key], 'isoformat'):
+                    item[key] = item[key].isoformat()
+                elif item.get(key):
+                    item[key] = str(item[key])
+            items.append(item)
+            
+        # 总数查询
+        count_sql = f"SELECT COUNT(*) FROM {table_name}{where_sql}"
+        cursor.execute(count_sql, tuple(params[:-2]))
+        total = cursor.fetchone()[0]
+        
+        cursor.close()
+        conn.close()
+        
+        return ApiResponse(
+            code=0,
+            message="查询成功",
+            data={"items": items, "total": total, "page": page, "size": size},
+            timestamp=datetime.now(timezone.utc).isoformat()
+        ).model_dump()
+        
+    except Exception as e:
+        print(f"查询基本信息失败: {e}")
+        return ApiResponse(code=500, message=f"服务器内部错误: {str(e)}", timestamp=datetime.now(timezone.utc).isoformat()).model_dump()
 
 
 @app.get("/api/v1/documents/categories/primary")
 @app.get("/api/v1/documents/categories/primary")
 async def get_primary_categories(credentials: HTTPAuthorizationCredentials = Depends(security)):
 async def get_primary_categories(credentials: HTTPAuthorizationCredentials = Depends(security)):

+ 3 - 1
fix_db_indexes.py → scripts/fix_db_indexes.py

@@ -3,7 +3,9 @@ import pymysql
 from urllib.parse import urlparse
 from urllib.parse import urlparse
 from dotenv import load_dotenv
 from dotenv import load_dotenv
 
 
-load_dotenv()
+# 加载环境变量 - 配置文件在脚本所在目录的上一级
+env_path = os.path.join(os.path.dirname(__file__), "..", ".env")
+load_dotenv(dotenv_path=env_path)
 
 
 def fix_indexes():
 def fix_indexes():
     """执行索引添加 SQL"""
     """执行索引添加 SQL"""

+ 72 - 29
scripts/miner_u.py

@@ -44,7 +44,7 @@ def get_db_connection():
         print(f"Database connection error: {e}")
         print(f"Database connection error: {e}")
         return None
         return None
 
 
-def update_db_status(table_name, doc_id, status=None, progress=None, error=None):
+def update_db_status(doc_id, status=None, progress=None, error=None, converted_file_name=None):
     conn = get_db_connection()
     conn = get_db_connection()
     if not conn:
     if not conn:
         return
         return
@@ -61,11 +61,17 @@ def update_db_status(table_name, doc_id, status=None, progress=None, error=None)
             if error is not None:
             if error is not None:
                 updates.append("conversion_error = %s")
                 updates.append("conversion_error = %s")
                 params.append(error)
                 params.append(error)
+            if converted_file_name is not None:
+                updates.append("converted_file_name = %s")
+                params.append(converted_file_name)
             
             
             if not updates:
             if not updates:
                 return
                 return
+            
+            # 同时更新修改时间
+            updates.append("updated_time = NOW()")
                 
                 
-            sql = f"UPDATE {table_name} SET {', '.join(updates)} WHERE id = %s"
+            sql = f"UPDATE t_document_main SET {', '.join(updates)} WHERE id = %s"
             params.append(doc_id)
             params.append(doc_id)
             cursor.execute(sql, params)
             cursor.execute(sql, params)
     except Exception as e:
     except Exception as e:
@@ -109,10 +115,10 @@ def poll_batch(batch_id, interval_sec=5, timeout_sec=1800):
             raise TimeoutError(f"poll timeout for batch_id={batch_id}")
             raise TimeoutError(f"poll timeout for batch_id={batch_id}")
         time.sleep(interval_sec)
         time.sleep(interval_sec)
 
 
-def process_document(table_name, doc_id, chinese_name, file_url, out_dir):
+def process_document(doc_id, chinese_name, file_url, out_dir):
     try:
     try:
         # 1. 更新状态:开始转换
         # 1. 更新状态:开始转换
-        update_db_status(table_name, doc_id, status=1, progress=10)
+        update_db_status(doc_id, status=1, progress=10)
         
         
         # 2. 下载原始文件
         # 2. 下载原始文件
         print(f"Downloading {file_url}...")
         print(f"Downloading {file_url}...")
@@ -120,19 +126,24 @@ def process_document(table_name, doc_id, chinese_name, file_url, out_dir):
         resp.raise_for_status()
         resp.raise_for_status()
         file_content = resp.content
         file_content = resp.content
         
         
+        # 检查文件类型
+        content_type = resp.headers.get("Content-Type", "").lower()
+        if "text/html" in content_type:
+            raise RuntimeError("不支持对网页链接进行转换,请直接查看原链接。")
+        
         file_ext = Path(urlparse(file_url).path).suffix.lower()
         file_ext = Path(urlparse(file_url).path).suffix.lower()
         if not file_ext:
         if not file_ext:
             file_ext = ".pdf" # Default
             file_ext = ".pdf" # Default
             
             
         file_name = f"{chinese_name}{file_ext}"
         file_name = f"{chinese_name}{file_ext}"
-        update_db_status(table_name, doc_id, progress=30)
+        update_db_status(doc_id, progress=30)
         
         
         # 3. 提交到 MinerU
         # 3. 提交到 MinerU
         files_meta = [{"name": file_name, "data_id": doc_id}]
         files_meta = [{"name": file_name, "data_id": doc_id}]
         batch_id, upload_urls = apply_upload_urls(files_meta)
         batch_id, upload_urls = apply_upload_urls(files_meta)
         upload_files([file_content], upload_urls)
         upload_files([file_content], upload_urls)
         
         
-        update_db_status(table_name, doc_id, progress=50)
+        update_db_status(doc_id, progress=50)
         
         
         # 4. 轮询结果
         # 4. 轮询结果
         results = poll_batch(batch_id)
         results = poll_batch(batch_id)
@@ -142,23 +153,24 @@ def process_document(table_name, doc_id, chinese_name, file_url, out_dir):
             zip_url = result.get("full_zip_url")
             zip_url = result.get("full_zip_url")
             if zip_url:
             if zip_url:
                 # 5. 下载并处理结果
                 # 5. 下载并处理结果
-                update_db_status(table_name, doc_id, progress=80)
+                update_db_status(doc_id, progress=80)
                 zip_resp = requests.get(zip_url, timeout=300)
                 zip_resp = requests.get(zip_url, timeout=300)
                 zip_resp.raise_for_status()
                 zip_resp.raise_for_status()
                 
                 
                 # 解压并保存 Markdown
                 # 解压并保存 Markdown
+                converted_file_name = f"{chinese_name}.md"
                 with zipfile.ZipFile(io.BytesIO(zip_resp.content)) as z:
                 with zipfile.ZipFile(io.BytesIO(zip_resp.content)) as z:
                     # 查找 .md 文件
                     # 查找 .md 文件
                     md_files = [f for f in z.namelist() if f.endswith(".md")]
                     md_files = [f for f in z.namelist() if f.endswith(".md")]
                     if md_files:
                     if md_files:
                         md_content = z.read(md_files[0])
                         md_content = z.read(md_files[0])
-                        save_path = Path(out_dir) / f"{chinese_name}.md"
+                        save_path = Path(out_dir) / converted_file_name
                         save_path.parent.mkdir(parents=True, exist_ok=True)
                         save_path.parent.mkdir(parents=True, exist_ok=True)
                         with open(save_path, "wb") as f:
                         with open(save_path, "wb") as f:
                             f.write(md_content)
                             f.write(md_content)
                         print(f"Saved Markdown to {save_path}")
                         print(f"Saved Markdown to {save_path}")
                 
                 
-                update_db_status(table_name, doc_id, status=2, progress=100)
+                update_db_status(doc_id, status=2, progress=100, converted_file_name=converted_file_name)
                 return True
                 return True
             else:
             else:
                 raise RuntimeError("No zip URL in result")
                 raise RuntimeError("No zip URL in result")
@@ -168,19 +180,11 @@ def process_document(table_name, doc_id, chinese_name, file_url, out_dir):
             
             
     except Exception as e:
     except Exception as e:
         print(f"Process failed: {e}")
         print(f"Process failed: {e}")
-        update_db_status(table_name, doc_id, status=3, error=str(e))
+        update_db_status(doc_id, status=3, error=str(e))
         return False
         return False
 
 
-def main_cli(table_type, doc_id, out_dir=r"d:\UGit\MinerU"):
-    # 获取表名
-    TABLE_MAP = {
-        "basis": "t_basis_of_preparation",
-        "work": "t_work_of_preparation",
-        "job": "t_job_of_preparation"
-    }
-    table_name = TABLE_MAP.get(table_type, "t_basis_of_preparation")
-    
-    # 从数据库获取详细信息
+def main_cli(doc_id, out_dir=r"d:\UGit\MinerU"):
+    # 从数据库获取详细信息 - 直接从 t_document_main 获取
     conn = get_db_connection()
     conn = get_db_connection()
     if not conn:
     if not conn:
         print("Database connection failed")
         print("Database connection failed")
@@ -188,21 +192,60 @@ def main_cli(table_type, doc_id, out_dir=r"d:\UGit\MinerU"):
         
         
     try:
     try:
         with conn.cursor() as cursor:
         with conn.cursor() as cursor:
-            cursor.execute(f"SELECT chinese_name, file_url FROM {table_name} WHERE id = %s", (doc_id,))
+            # 优先从 t_document_main 获取 title 和 file_url
+            cursor.execute("SELECT title, file_url FROM t_document_main WHERE id = %s", (doc_id,))
             row = cursor.fetchone()
             row = cursor.fetchone()
-            if not row:
-                print(f"Document not found: {doc_id} in {table_name}")
+            if not row or not row[1]: # 如果主表没有 file_url,尝试从子表获取
+                if not row:
+                    print(f"Document not found: {doc_id}")
+                    return
+                
+                title = row[0]
+                # 尝试从子表获取 (兼容旧数据)
+                cursor.execute("SELECT source_type, source_id FROM t_document_main WHERE id = %s", (doc_id,))
+                st_row = cursor.fetchone()
+                if st_row:
+                    source_type, source_id = st_row
+                    TABLE_MAP = {
+                        "basis": "t_basis_of_preparation",
+                        "work": "t_work_of_preparation",
+                        "job": "t_job_of_preparation"
+                    }
+                    table_name = TABLE_MAP.get(source_type)
+                    if table_name:
+                        # 尝试不同的 url 字段名
+                        url_fields = ['file_url', 'source_url', 'url']
+                        for field in url_fields:
+                            try:
+                                cursor.execute(f"SELECT {field} FROM {table_name} WHERE id = %s", (source_id,))
+                                url_row = cursor.fetchone()
+                                if url_row and url_row[0]:
+                                    file_url = url_row[0]
+                                    process_document(doc_id, title, file_url, out_dir)
+                                    return
+                            except:
+                                continue
+                
+                print(f"No file_url found for document: {doc_id}")
+                update_db_status(doc_id, status=3, error="未找到文件链接(file_url)")
                 return
                 return
-            chinese_name, file_url = row
             
             
-        process_document(table_name, doc_id, chinese_name, file_url, out_dir)
+            title, file_url = row
+            process_document(doc_id, title, file_url, out_dir)
     finally:
     finally:
         conn.close()
         conn.close()
 
 
 if __name__ == "__main__":
 if __name__ == "__main__":
-    # 示例用法:python miner_u.py basis <doc_id>
+    # 示例用法:python miner_u.py <doc_id>
     import sys
     import sys
-    if len(sys.argv) > 2:
-        main_cli(sys.argv[1], sys.argv[2])
+    if len(sys.argv) > 1:
+        # 这里的参数处理需要微调,因为以前是 python miner_u.py <table_type> <doc_id>
+        # 现在我们只需要 <doc_id>,但为了兼容性,我们可以检查参数个数
+        if len(sys.argv) == 3:
+            # 旧格式: python miner_u.py basis <doc_id>
+            main_cli(sys.argv[2])
+        else:
+            # 新格式: python miner_u.py <doc_id>
+            main_cli(sys.argv[1])
     else:
     else:
-        print("Usage: python miner_u.py <table_type> <doc_id>")
+        print("Usage: python miner_u.py <doc_id>")