CRBC-MaaS-Platform-Project
/
LabelingSystem


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606
							"""
External API Service.
Provides business logic for external system integration.
"""
import uuid
import json
import logging
from datetime import datetime
from typing import Optional, List, Dict, Any

from database import get_db_connection
from schemas.external import (
    TaskType, ProjectInitRequest, ProjectInitResponse,
    ProgressResponse, AnnotatorProgress,
    ExternalExportFormat, ExternalExportRequest, ExternalExportResponse,
    TaskDataItem
)

logger = logging.getLogger(__name__)


# 默认XML配置模板（不含标签，由管理员后续配置）
DEFAULT_CONFIGS = {
    TaskType.TEXT_CLASSIFICATION: """<View>
  <Text name="text" value="$text"/>
  <Choices name="label" toName="text" choice="single">
    <!-- 标签由管理员配置 -->
  </Choices>
</View>""",
    TaskType.IMAGE_CLASSIFICATION: """<View>
  <Image name="image" value="$image"/>
  <Choices name="label" toName="image" choice="single">
    <!-- 标签由管理员配置 -->
  </Choices>
</View>""",
    TaskType.OBJECT_DETECTION: """<View>
  <Image name="image" value="$image"/>
  <RectangleLabels name="label" toName="image">
    <!-- 标签由管理员配置 -->
  </RectangleLabels>
</View>""",
    TaskType.NER: """<View>
  <Text name="text" value="$text"/>
  <Labels name="label" toName="text">
    <!-- 标签由管理员配置 -->
  </Labels>
</View>"""
}


class ExternalService:
    """对外API服务类"""
    
    @staticmethod
    def get_default_config(task_type: TaskType) -> str:
        """获取任务类型对应的默认XML配置"""
        return DEFAULT_CONFIGS.get(task_type, DEFAULT_CONFIGS[TaskType.TEXT_CLASSIFICATION])
    
    @staticmethod
    def init_project(request: ProjectInitRequest, user_id: str) -> ProjectInitResponse:
        """
        初始化项目并创建任务
        
        Args:
            request: 项目初始化请求
            user_id: 创建者用户ID
            
        Returns:
            ProjectInitResponse: 项目初始化响应
        """
        # 生成项目ID
        project_id = f"proj_{uuid.uuid4().hex[:12]}"
        
        # 获取默认配置
        config = ExternalService.get_default_config(request.task_type)
        
        with get_db_connection() as conn:
            cursor = conn.cursor()
            
            # 创建项目
            cursor.execute("""
                INSERT INTO projects (id, name, description, config, status, source, task_type, external_id, updated_at)
                VALUES (?, ?, ?, ?, 'draft', 'external', ?, ?, CURRENT_TIMESTAMP)
            """, (
                project_id,
                request.name,
                request.description or "",
                config,
                request.task_type.value,
                request.external_id
            ))
            
            # 创建任务
            task_count = 0
            for i, item in enumerate(request.data):
                task_id = f"task_{uuid.uuid4().hex[:12]}"
                task_name = f"Task {i + 1}"
                
                # 根据任务类型构建数据格式
                if request.task_type in [TaskType.TEXT_CLASSIFICATION, TaskType.NER]:
                    task_data = {
                        "text": item.content,
                        "external_id": item.id,
                        "metadata": item.metadata or {}
                    }
                else:
                    task_data = {
                        "image": item.content,
                        "external_id": item.id,
                        "metadata": item.metadata or {}
                    }
                
                cursor.execute("""
                    INSERT INTO tasks (id, project_id, name, data, status)
                    VALUES (?, ?, ?, ?, 'pending')
                """, (
                    task_id,
                    project_id,
                    task_name,
                    json.dumps(task_data)
                ))
                task_count += 1
            
            # 获取创建时间
            cursor.execute("SELECT created_at FROM projects WHERE id = ?", (project_id,))
            row = cursor.fetchone()
            created_at = row["created_at"] if row else datetime.now()
            
            return ProjectInitResponse(
                project_id=project_id,
                project_name=request.name,
                task_count=task_count,
                status="draft",
                created_at=created_at,
                config=config,
                external_id=request.external_id
            )
    
    @staticmethod
    def get_project_progress(project_id: str) -> Optional[ProgressResponse]:
        """
        获取项目进度
        
        Args:
            project_id: 项目ID
            
        Returns:
            ProgressResponse: 进度响应，如果项目不存在返回None
        """
        with get_db_connection() as conn:
            cursor = conn.cursor()
            
            # 获取项目信息
            cursor.execute("""
                SELECT id, name, status, updated_at
                FROM projects
                WHERE id = ?
            """, (project_id,))
            
            project = cursor.fetchone()
            if not project:
                return None
            
            # 获取任务统计
            cursor.execute("""
                SELECT 
                    COUNT(*) as total,
                    SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) as completed,
                    SUM(CASE WHEN status = 'in_progress' THEN 1 ELSE 0 END) as in_progress,
                    SUM(CASE WHEN status = 'pending' THEN 1 ELSE 0 END) as pending
                FROM tasks
                WHERE project_id = ?
            """, (project_id,))
            
            stats = cursor.fetchone()
            total_tasks = stats["total"] or 0
            completed_tasks = stats["completed"] or 0
            in_progress_tasks = stats["in_progress"] or 0
            pending_tasks = stats["pending"] or 0
            
            # 计算完成百分比
            completion_percentage = 0.0
            if total_tasks > 0:
                completion_percentage = round((completed_tasks / total_tasks) * 100, 2)
            
            # 获取标注人员统计
            cursor.execute("""
                SELECT 
                    t.assigned_to,
                    u.username,
                    COUNT(*) as assigned_count,
                    SUM(CASE WHEN t.status = 'completed' THEN 1 ELSE 0 END) as completed_count,
                    SUM(CASE WHEN t.status = 'in_progress' THEN 1 ELSE 0 END) as in_progress_count
                FROM tasks t
                LEFT JOIN users u ON t.assigned_to = u.id
                WHERE t.project_id = ? AND t.assigned_to IS NOT NULL
                GROUP BY t.assigned_to, u.username
            """, (project_id,))
            
            annotators = []
            for row in cursor.fetchall():
                assigned_count = row["assigned_count"] or 0
                completed_count = row["completed_count"] or 0
                completion_rate = 0.0
                if assigned_count > 0:
                    completion_rate = round((completed_count / assigned_count) * 100, 2)
                
                annotators.append(AnnotatorProgress(
                    user_id=row["assigned_to"] or "",
                    username=row["username"] or "Unknown",
                    assigned_count=assigned_count,
                    completed_count=completed_count,
                    in_progress_count=row["in_progress_count"] or 0,
                    completion_rate=completion_rate
                ))
            
            return ProgressResponse(
                project_id=project_id,
                project_name=project["name"],
                status=project["status"] or "draft",
                total_tasks=total_tasks,
                completed_tasks=completed_tasks,
                in_progress_tasks=in_progress_tasks,
                pending_tasks=pending_tasks,
                completion_percentage=completion_percentage,
                annotators=annotators,
                last_updated=project["updated_at"]
            )
    
    @staticmethod
    def check_project_exists(project_id: str) -> bool:
        """检查项目是否存在"""
        with get_db_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("SELECT id FROM projects WHERE id = ?", (project_id,))
            return cursor.fetchone() is not None


    @staticmethod
    def export_project_data(
        project_id: str,
        request: ExternalExportRequest,
        base_url: str = ""
    ) -> Optional[ExternalExportResponse]:
        """
        导出项目数据
        
        Args:
            project_id: 项目ID
            request: 导出请求
            base_url: 基础URL，用于生成下载链接
            
        Returns:
            ExternalExportResponse: 导出响应，如果项目不存在返回None
        """
        import os
        from datetime import timedelta
        
        with get_db_connection() as conn:
            cursor = conn.cursor()
            
            # 检查项目是否存在
            cursor.execute("SELECT id, name FROM projects WHERE id = ?", (project_id,))
            project = cursor.fetchone()
            if not project:
                return None
            
            # 构建查询条件
            status_filter = ""
            if request.completed_only:
                status_filter = "AND t.status = 'completed'"
            
            # 获取任务和标注数据
            cursor.execute(f"""
                SELECT 
                    t.id as task_id,
                    t.data,
                    t.status,
                    t.assigned_to,
                    u.username as annotator_name,
                    a.id as annotation_id,
                    a.result as annotation_result,
                    a.updated_at as annotation_time
                FROM tasks t
                LEFT JOIN users u ON t.assigned_to = u.id
                LEFT JOIN annotations a ON t.id = a.task_id
                WHERE t.project_id = ? {status_filter}
                ORDER BY t.id
            """, (project_id,))
            
            rows = cursor.fetchall()
            
            # 组织数据
            tasks_data = {}
            for row in rows:
                task_id = row["task_id"]
                if task_id not in tasks_data:
                    task_data = json.loads(row["data"]) if row["data"] else {}
                    tasks_data[task_id] = {
                        "task_id": task_id,
                        "external_id": task_data.get("external_id"),
                        "original_data": task_data,
                        "annotations": [],
                        "status": row["status"],
                        "annotator": row["annotator_name"],
                        "completed_at": None
                    }
                
                if row["annotation_id"]:
                    annotation_result = json.loads(row["annotation_result"]) if row["annotation_result"] else {}
                    tasks_data[task_id]["annotations"].append(annotation_result)
                    if row["annotation_time"]:
                        tasks_data[task_id]["completed_at"] = row["annotation_time"]
            
            # 转换为列表
            export_data = list(tasks_data.values())
            total_exported = len(export_data)
            
            # 生成导出文件
            export_id = f"export_{uuid.uuid4().hex[:12]}"
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            
            # 确保导出目录存在
            export_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "exports")
            os.makedirs(export_dir, exist_ok=True)
            
            # 根据格式生成文件
            file_name, file_content = ExternalService._generate_export_file(
                export_data,
                request.format,
                project_id,
                timestamp
            )
            
            file_path = os.path.join(export_dir, file_name)
            
            # 写入文件
            if isinstance(file_content, str):
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(file_content)
            else:
                with open(file_path, 'wb') as f:
                    f.write(file_content)
            
            file_size = os.path.getsize(file_path)
            
            # 记录导出任务
            cursor.execute("""
                INSERT INTO export_jobs (id, project_id, format, status, status_filter, file_path, total_tasks, exported_tasks, completed_at)
                VALUES (?, ?, ?, 'completed', ?, ?, ?, ?, CURRENT_TIMESTAMP)
            """, (
                export_id,
                project_id,
                request.format.value,
                'completed' if request.completed_only else 'all',
                file_path,
                total_exported,
                total_exported
            ))
            
            # 计算过期时间（7天后）
            expires_at = datetime.now() + timedelta(days=7)
            
            return ExternalExportResponse(
                project_id=project_id,
                format=request.format.value,
                total_exported=total_exported,
                file_url=f"/api/exports/{export_id}/download",
                file_name=file_name,
                file_size=file_size,
                expires_at=expires_at
            )
    
    @staticmethod
    def _generate_export_file(
        data: List[Dict],
        format: ExternalExportFormat,
        project_id: str,
        timestamp: str
    ) -> tuple:
        """
        根据格式生成导出文件
        
        Returns:
            tuple: (文件名, 文件内容)
        """
        if format == ExternalExportFormat.JSON:
            return ExternalService._export_json(data, project_id, timestamp)
        elif format == ExternalExportFormat.CSV:
            return ExternalService._export_csv(data, project_id, timestamp)
        elif format == ExternalExportFormat.SHAREGPT:
            return ExternalService._export_sharegpt(data, project_id, timestamp)
        elif format == ExternalExportFormat.YOLO:
            return ExternalService._export_yolo(data, project_id, timestamp)
        elif format == ExternalExportFormat.COCO:
            return ExternalService._export_coco(data, project_id, timestamp)
        elif format == ExternalExportFormat.ALPACA:
            return ExternalService._export_alpaca(data, project_id, timestamp)
        else:
            return ExternalService._export_json(data, project_id, timestamp)
    
    @staticmethod
    def _export_json(data: List[Dict], project_id: str, timestamp: str) -> tuple:
        """导出JSON格式"""
        file_name = f"export_{project_id}_{timestamp}.json"
        content = json.dumps(data, ensure_ascii=False, indent=2)
        return file_name, content
    
    @staticmethod
    def _export_csv(data: List[Dict], project_id: str, timestamp: str) -> tuple:
        """导出CSV格式"""
        import csv
        import io
        
        file_name = f"export_{project_id}_{timestamp}.csv"
        
        output = io.StringIO()
        writer = csv.writer(output)
        
        # 写入表头
        writer.writerow(['task_id', 'external_id', 'status', 'annotator', 'original_data', 'annotations'])
        
        # 写入数据
        for item in data:
            writer.writerow([
                item.get('task_id', ''),
                item.get('external_id', ''),
                item.get('status', ''),
                item.get('annotator', ''),
                json.dumps(item.get('original_data', {}), ensure_ascii=False),
                json.dumps(item.get('annotations', []), ensure_ascii=False)
            ])
        
        return file_name, output.getvalue()
    
    @staticmethod
    def _export_sharegpt(data: List[Dict], project_id: str, timestamp: str) -> tuple:
        """导出ShareGPT对话格式"""
        file_name = f"export_{project_id}_sharegpt_{timestamp}.json"
        
        conversations = []
        for item in data:
            original = item.get('original_data', {})
            annotations = item.get('annotations', [])
            
            # 获取原始文本
            text = original.get('text', original.get('image', ''))
            
            # 获取标注结果
            label = ""
            if annotations:
                for ann in annotations:
                    if isinstance(ann, list):
                        for a in ann:
                            if 'value' in a and 'choices' in a['value']:
                                label = ', '.join(a['value']['choices'])
                                break
                    elif isinstance(ann, dict):
                        if 'value' in ann and 'choices' in ann['value']:
                            label = ', '.join(ann['value']['choices'])
            
            if text and label:
                conversations.append({
                    "conversations": [
                        {"from": "human", "value": text},
                        {"from": "gpt", "value": label}
                    ]
                })
        
        content = json.dumps(conversations, ensure_ascii=False, indent=2)
        return file_name, content
    
    @staticmethod
    def _export_yolo(data: List[Dict], project_id: str, timestamp: str) -> tuple:
        """导出YOLO格式（简化版，返回JSON描述）"""
        file_name = f"export_{project_id}_yolo_{timestamp}.json"
        
        yolo_data = []
        for item in data:
            original = item.get('original_data', {})
            annotations = item.get('annotations', [])
            
            image_url = original.get('image', '')
            boxes = []
            
            for ann in annotations:
                if isinstance(ann, list):
                    for a in ann:
                        if a.get('type') == 'rectanglelabels':
                            value = a.get('value', {})
                            boxes.append({
                                "label": value.get('rectanglelabels', [''])[0],
                                "x": value.get('x', 0) / 100,
                                "y": value.get('y', 0) / 100,
                                "width": value.get('width', 0) / 100,
                                "height": value.get('height', 0) / 100
                            })
            
            if image_url:
                yolo_data.append({
                    "image": image_url,
                    "boxes": boxes
                })
        
        content = json.dumps(yolo_data, ensure_ascii=False, indent=2)
        return file_name, content
    
    @staticmethod
    def _export_coco(data: List[Dict], project_id: str, timestamp: str) -> tuple:
        """导出COCO格式"""
        file_name = f"export_{project_id}_coco_{timestamp}.json"
        
        coco_data = {
            "images": [],
            "annotations": [],
            "categories": []
        }
        
        category_map = {}
        annotation_id = 1
        
        for idx, item in enumerate(data):
            original = item.get('original_data', {})
            annotations = item.get('annotations', [])
            
            image_url = original.get('image', '')
            
            # 添加图像
            coco_data["images"].append({
                "id": idx + 1,
                "file_name": image_url,
                "width": 0,
                "height": 0
            })
            
            # 处理标注
            for ann in annotations:
                if isinstance(ann, list):
                    for a in ann:
                        if a.get('type') == 'rectanglelabels':
                            value = a.get('value', {})
                            label = value.get('rectanglelabels', [''])[0]
                            
                            # 添加类别
                            if label and label not in category_map:
                                cat_id = len(category_map) + 1
                                category_map[label] = cat_id
                                coco_data["categories"].append({
                                    "id": cat_id,
                                    "name": label
                                })
                            
                            if label:
                                coco_data["annotations"].append({
                                    "id": annotation_id,
                                    "image_id": idx + 1,
                                    "category_id": category_map.get(label, 0),
                                    "bbox": [
                                        value.get('x', 0),
                                        value.get('y', 0),
                                        value.get('width', 0),
                                        value.get('height', 0)
                                    ],
                                    "area": value.get('width', 0) * value.get('height', 0),
                                    "iscrowd": 0
                                })
                                annotation_id += 1
        
        content = json.dumps(coco_data, ensure_ascii=False, indent=2)
        return file_name, content
    
    @staticmethod
    def _export_alpaca(data: List[Dict], project_id: str, timestamp: str) -> tuple:
        """导出Alpaca指令微调格式"""
        file_name = f"export_{project_id}_alpaca_{timestamp}.json"
        
        alpaca_data = []
        for item in data:
            original = item.get('original_data', {})
            annotations = item.get('annotations', [])
            
            # 获取原始文本
            text = original.get('text', '')
            
            # 获取标注结果
            label = ""
            if annotations:
                for ann in annotations:
                    if isinstance(ann, list):
                        for a in ann:
                            if 'value' in a and 'choices' in a['value']:
                                label = ', '.join(a['value']['choices'])
                                break
                    elif isinstance(ann, dict):
                        if 'value' in ann and 'choices' in ann['value']:
                            label = ', '.join(ann['value']['choices'])
            
            if text:
                alpaca_data.append({
                    "instruction": "请对以下文本进行分类",
                    "input": text,
                    "output": label or "未标注"
                })
        
        content = json.dumps(alpaca_data, ensure_ascii=False, indent=2)
        return file_name, content