# system-design/software-copyright/02-writech-ai-engine/api/essay_api.py

# 自然写手写识别与AI分析引擎软件 V1.0
# 作文批改接口模块 - AI作文评分与批改建议服务

"""
作文批改API接口
提供AI作文评分、多维度分析（结构/语法/内容/修辞）、批改建议生成等功能
支持小学至初中阶段作文批改，基于大语言模型与NLP分析管道
"""

import time
import json
import logging
import hashlib
import re
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass, field
from enum import Enum
from fastapi import APIRouter, HTTPException, Depends
from pydantic import BaseModel, Field, validator

# Module-level logger named after this module; this file attaches no handlers
# or levels of its own.
logger = logging.getLogger(__name__)

# ==================== 数据模型定义 ====================

class EssayReviewRequest(BaseModel):
    """Request payload for an essay review.

    Carries the OCR-recognised essay text plus the grading context (grade
    level, genre, full-mark value) and optional identifiers that are only
    echoed into the audit log.
    """
    # Essay body as recognised by OCR; length is bounded to 10-5000 characters.
    text: str = Field(..., min_length=10, max_length=5000, description="作文OCR识别文本")
    # Optional essay title.
    title: Optional[str] = Field(None, description="作文题目")
    # School grade, 1 through 9.
    grade: int = Field(3, ge=1, le=9, description="年级(1-9)")
    # Genre keyword; validated against a fixed list by validate_genre below.
    genre: str = Field("narrative", description="文体类型: narrative/argumentative/expository/descriptive")
    # Full-mark value the weighted score is rescaled to. It is also used as a
    # divisor when the overall comment is chosen, so it must be strictly
    # positive: zero would raise ZeroDivisionError and a negative value would
    # produce a meaningless negative score.
    max_score: int = Field(100, gt=0, description="满分值")
    # Optional identifiers; the scoring itself does not depend on them.
    student_id: Optional[str] = Field(None, description="学生ID")
    assignment_id: Optional[str] = Field(None, description="作业ID")
    # When False the grammar error list is omitted from the response.
    enable_suggestions: bool = Field(True, description="是否生成修改建议")

    @validator('genre')
    def validate_genre(cls, v):
        """Reject any genre outside the four supported keywords.

        Raises:
            ValueError: if ``v`` is not one of the supported genres.
        """
        valid_genres = ['narrative', 'argumentative', 'expository', 'descriptive']
        if v not in valid_genres:
            raise ValueError(f'文体类型必须为: {valid_genres}')
        return v


class SentenceError(BaseModel):
    """A single sentence-level problem found in the essay text."""
    # The offending text fragment (a regex match or a truncated sentence).
    sentence: str = Field(..., description="原始句子")
    # Machine-readable error category, e.g. "repetition" or "long_sentence".
    error_type: str = Field(..., description="错误类型")
    # Human-readable advice on how to fix the problem.
    suggestion: str = Field(..., description="修改建议")
    # Character offset of the fragment within the original text.
    position: int = Field(..., description="句子在原文中的位置索引")


class EssayScoreDetail(BaseModel):
    """Per-dimension score breakdown for an essay."""
    # Structure score.
    structure: float = Field(..., description="结构分")
    # Grammar score.
    grammar: float = Field(..., description="语法分")
    # Content score.
    content: float = Field(..., description="内容分")
    # Rhetoric score.
    rhetoric: float = Field(..., description="修辞分")
    # Handwriting score; optional and defaults to None when not available.
    handwriting: Optional[float] = Field(None, description="书写分（如有）")


# ==================== 文本分析工具 ====================

class TextAnalyzer:
    """Heuristic helpers for analysing Chinese essay text.

    Provides sentence and paragraph splitting, character statistics and a
    keyword/regex based detector for a few rhetorical devices. Every method
    is static; the class only serves as a namespace.
    """

    # Sentence-terminating punctuation. A Chinese ellipsis is written as two
    # consecutive '…' characters; the single character is listed as well
    # because sentence splitting tests membership one character at a time.
    SENTENCE_ENDINGS = {'。', '！', '？', '……', '…', '；'}
    # Two full-width spaces, the conventional paragraph indent.
    PARAGRAPH_INDENT = '　　'

    # Characters counted as punctuation. Quote characters are spelled with
    # escapes so that neither the ASCII quotes nor the curly Chinese quotes
    # can be lost to string-literal quoting.
    _PUNCTUATION = frozenset(
        '，。！？、；：（）《》…—'
        '"\''
        '\u201c\u201d\u2018\u2019'
    )

    # Simile markers such as "像…一样", "如同", "仿佛".
    _SIMILE_PATTERNS = tuple(re.compile(p) for p in (
        r'像.{2,10}一样', r'如同.{2,10}', r'仿佛.{2,10}',
        r'好像.{2,10}', r'犹如.{2,10}', r'宛如.{2,10}',
    ))
    # A natural object followed within a few characters by a human action or
    # manner character.
    _PERSONIFICATION_PATTERNS = tuple(re.compile(p) for p in (
        r'[风雨雪花树草月阳光河水山].{0,3}[笑哭唱跳跑走说叫]',
        r'[风雨雪花树草月阳光河水山].{0,3}[温柔轻轻悄悄]',
    ))

    @staticmethod
    def split_sentences(text: str) -> List[str]:
        """Split ``text`` into sentences at sentence-ending punctuation.

        Each returned sentence keeps its terminating punctuation and is
        stripped of surrounding whitespace; whitespace-only pieces are
        dropped. A run of '…' characters is kept together and ends the
        sentence once, after its last character. Trailing text without a
        terminator is returned as the final sentence.
        """
        sentences = []
        start = 0
        last_index = len(text) - 1
        for idx, char in enumerate(text):
            if char not in TextAnalyzer.SENTENCE_ENDINGS:
                continue
            # Inside an ellipsis run: wait for its last character.
            if char == '…' and idx < last_index and text[idx + 1] == '…':
                continue
            piece = text[start:idx + 1].strip()
            if piece:
                sentences.append(piece)
            start = idx + 1
        tail = text[start:].strip()
        if tail:
            sentences.append(tail)
        return sentences

    @staticmethod
    def split_paragraphs(text: str) -> List[str]:
        """Split ``text`` on newlines and return the non-blank, stripped lines."""
        return [line.strip() for line in text.split('\n') if line.strip()]

    @staticmethod
    def count_characters(text: str) -> Dict[str, int]:
        """Return character statistics for ``text``.

        Returns:
            A dict with ``total`` (length after removing ASCII spaces and
            newlines), ``chinese`` (characters in U+4E00..U+9FFF) and
            ``punctuation`` (characters in the punctuation set, which covers
            both ASCII and curly quotation marks).
        """
        chinese_count = sum(1 for c in text if '\u4e00' <= c <= '\u9fff')
        punctuation_count = sum(1 for c in text if c in TextAnalyzer._PUNCTUATION)
        total_count = len(text.replace(' ', '').replace('\n', ''))
        return {
            "total": total_count,
            "chinese": chinese_count,
            "punctuation": punctuation_count
        }

    @staticmethod
    def detect_rhetoric(text: str) -> List[Dict]:
        """Detect similes, parallelism and personification in ``text``.

        Returns:
            A list of dicts with keys ``type``, ``name``, ``text`` and
            ``position`` (character offset in ``text``). Entries are grouped
            in the order simile, parallelism, personification.
        """
        rhetorics = []

        for pattern in TextAnalyzer._SIMILE_PATTERNS:
            for m in pattern.finditer(text):
                rhetorics.append({
                    "type": "simile", "name": "比喻",
                    "text": m.group(), "position": m.start()
                })

        # Parallelism heuristic: three consecutive sentences of similar length
        # (each longer than 5 characters) that start with the same character.
        sentences = TextAnalyzer.split_sentences(text)
        # Locate every sentence with a forward-moving cursor so that a
        # repeated sentence is reported at its own offset rather than at the
        # offset of its first occurrence.
        offsets = []
        cursor = 0
        for sentence in sentences:
            found_at = text.find(sentence, cursor)
            offsets.append(found_at)
            if found_at >= 0:
                cursor = found_at + len(sentence)

        for i in range(len(sentences) - 2):
            s1, s2, s3 = sentences[i], sentences[i + 1], sentences[i + 2]
            if (abs(len(s1) - len(s2)) < 5 and abs(len(s2) - len(s3)) < 5 and
                    len(s1) > 5 and s1[0] == s2[0] == s3[0]):
                rhetorics.append({
                    "type": "parallelism", "name": "排比",
                    "text": f"{s1}{s2}{s3}", "position": offsets[i]
                })

        for pattern in TextAnalyzer._PERSONIFICATION_PATTERNS:
            for m in pattern.finditer(text):
                rhetorics.append({
                    "type": "personification", "name": "拟人",
                    "text": m.group(), "position": m.start()
                })

        return rhetorics


# ==================== 作文评分引擎 ====================

class EssayScoringEngine:
    """Rule-based essay scorer combining four weighted dimensions.

    Each dimension (structure, grammar, content, rhetoric) is scored on a
    0-100 scale, combined using DIMENSION_WEIGHTS (25% / 25% / 30% / 20%)
    and rescaled to the request's ``max_score``.
    """

    # Expected Chinese character count (min, max) per grade.
    EXPECTED_LENGTH = {
        1: (50, 150), 2: (100, 250), 3: (200, 400),
        4: (300, 500), 5: (350, 600), 6: (400, 700),
        7: (500, 800), 8: (600, 900), 9: (600, 1000)
    }

    # Weight of each dimension in the total score; the weights sum to 1.
    DIMENSION_WEIGHTS = {
        "structure": 0.25,
        "grammar": 0.25,
        "content": 0.30,
        "rhetoric": 0.20
    }

    # Logical connectors looked for when judging coherence.
    # NOTE(review): '但' is a substring of '但是', so a text containing only
    # '但是' counts as two connectors; kept as-is to leave scores unchanged.
    _CONNECTORS = ('因此', '所以', '但是', '然而', '首先', '其次', '最后', '总之',
                   '不仅', '而且', '虽然', '但', '因为', '于是')
    # Emotion words looked for when judging emotional expression.
    _EMOTION_WORDS = ('开心', '快乐', '高兴', '感动', '难过', '伤心', '惊讶',
                      '温暖', '幸福', '骄傲', '担心', '紧张')

    def __init__(self):
        """Create the text analyzer and load the grammar error patterns."""
        self._text_analyzer = TextAnalyzer()
        self._error_patterns = self._load_error_patterns()
        logger.info("作文评分引擎初始化完成")

    def _load_error_patterns(self) -> List[Dict]:
        """Return the built-in list of grammar error patterns.

        Each entry holds a regex ``pattern``, an error ``type`` and the
        suggestion ``msg`` reported when the pattern matches.
        """
        return [
            {"pattern": r"的的", "type": "repetition", "msg": "重复用字'的的'"},
            {"pattern": r"了了", "type": "repetition", "msg": "重复用字'了了'"},
            {"pattern": r"因为.{5,50}因为", "type": "logic", "msg": "重复使用'因为'，建议精简"},
            {"pattern": r"然后.{3,20}然后.{3,20}然后", "type": "style", "msg": "过度使用'然后'连接"},
            {"pattern": r"非常非常", "type": "repetition", "msg": "重复使用'非常'"},
            {"pattern": r"[，]{3,}", "type": "punctuation", "msg": "连续使用多个逗号，建议使用句号断句"},
        ]

    def score_structure(self, text: str, grade: int) -> Tuple[float, List[str]]:
        """Score the essay structure on a 0-100 scale.

        Deducts for too few paragraphs, a Chinese character count outside the
        grade's expected range, and a very short first or last paragraph.

        Returns:
            ``(score, comments)`` where ``comments`` explains each deduction.
        """
        comments = []
        score = 100.0

        paragraphs = self._text_analyzer.split_paragraphs(text)
        chinese_chars = self._text_analyzer.count_characters(text)["chinese"]

        # Paragraph count: fewer than 2 is penalised more than exactly 2.
        if len(paragraphs) < 2:
            score -= 25
            comments.append("文章缺少段落划分，建议分段书写使结构更清晰")
        elif len(paragraphs) < 3:
            score -= 10
            comments.append("段落较少，建议增加过渡段落")

        # Length: one point per 10 missing characters, capped at 30; a flat
        # 5-point deduction once the text exceeds 1.5x the upper bound.
        low, high = self.EXPECTED_LENGTH.get(grade, (300, 600))
        if chinese_chars < low:
            score -= min(30, (low - chinese_chars) // 10)
            comments.append(f"字数偏少（{chinese_chars}字），该年级建议{low}-{high}字")
        elif chinese_chars > high * 1.5:
            score -= 5
            comments.append("字数偏多，建议精简语句突出重点")

        # Opening and closing paragraphs should not be too short.
        if paragraphs:
            if len(paragraphs[0]) < 15:
                score -= 10
                comments.append("开头过于简短，建议丰富开篇引入")
            if len(paragraphs[-1]) < 10:
                score -= 10
                comments.append("结尾过于简短，建议加强收束呼应主题")

        return max(0.0, score), comments

    def score_grammar(self, text: str) -> Tuple[float, List["SentenceError"]]:
        """Score grammar on a 0-100 scale and collect the detected errors.

        Every match of a built-in error pattern costs 5 points; every
        sentence longer than 80 characters costs 3 points.

        Returns:
            ``(score, errors)`` with one ``SentenceError`` per finding.
        """
        errors = []
        score = 100.0

        for ep in self._error_patterns:
            for m in re.finditer(ep["pattern"], text):
                errors.append(SentenceError(
                    sentence=m.group(),
                    error_type=ep["type"],
                    suggestion=ep["msg"],
                    position=m.start()
                ))
                score -= 5

        # Walk the sentences with a forward-moving cursor so that a repeated
        # sentence is reported at its own offset, not at the first occurrence.
        cursor = 0
        for sentence in self._text_analyzer.split_sentences(text):
            position = text.find(sentence, cursor)
            if position >= 0:
                cursor = position + len(sentence)
            if len(sentence) > 80:
                errors.append(SentenceError(
                    sentence=sentence[:30] + "...",
                    error_type="long_sentence",
                    suggestion="句子过长，建议拆分为多个短句以提高可读性",
                    position=position
                ))
                score -= 3

        return max(0.0, score), errors

    def score_content(self, text: str, title: Optional[str], genre: str, grade: int) -> Tuple[float, List[str]]:
        """Score content quality on a 0-100 scale.

        Starts from a base of 85 and adjusts for vocabulary richness (ratio
        of distinct Chinese characters), use of logical connectors and
        presence of emotion words. ``title``, ``genre`` and ``grade`` are
        accepted but not used by the current heuristics.

        Returns:
            ``(score, comments)`` with the score clamped to 0-100.
        """
        comments = []
        score = 85.0

        chinese_chars = self._text_analyzer.count_characters(text)["chinese"]
        sentences = self._text_analyzer.split_sentences(text)

        # Vocabulary richness: distinct Chinese characters / all Chinese characters.
        distinct = {c for c in text if '\u4e00' <= c <= '\u9fff'}
        vocab_richness = len(distinct) / max(chinese_chars, 1)
        if vocab_richness > 0.6:
            score += 10
            comments.append("词汇丰富，用词多样化")
        elif vocab_richness < 0.3:
            score -= 10
            comments.append("词汇较为单一，建议使用更丰富的词语表达")

        # Coherence: reward three or more connectors; penalise none at all in
        # a text of more than five sentences.
        connector_count = sum(1 for c in self._CONNECTORS if c in text)
        if connector_count >= 3:
            score += 5
            comments.append("逻辑衔接词使用恰当，行文连贯")
        elif connector_count == 0 and len(sentences) > 5:
            score -= 5
            comments.append("缺少逻辑连接词，建议增加过渡衔接使行文更连贯")

        # Emotional expression: any emotion word earns a small bonus.
        if any(w in text for w in self._EMOTION_WORDS):
            score += 3
            comments.append("有恰当的情感表达，增强了文章感染力")

        return min(100, max(0, score)), comments

    def score_rhetoric(self, text: str, grade: int) -> Tuple[float, List[str]]:
        """Score the use of rhetorical devices on a 0-100 scale.

        Starts from a base of 70, adds points by the number of distinct
        device types detected, and deducts for grade 5 and above when fewer
        than two types are used.

        Returns:
            ``(score, comments)`` with the score clamped to 0-100.
        """
        comments = []
        score = 70.0

        rhetorics = self._text_analyzer.detect_rhetoric(text)
        rhetoric_types = {r["type"] for r in rhetorics}

        if len(rhetoric_types) >= 3:
            score += 25
            comments.append(f"修辞手法运用丰富，使用了{len(rhetoric_types)}种修辞手法")
        elif len(rhetoric_types) >= 1:
            score += 15
            # De-duplicate while keeping detection order so the comment text
            # is the same on every run (set order is not stable across runs).
            used_names = list(dict.fromkeys(r["name"] for r in rhetorics))
            comments.append(f"使用了{'、'.join(used_names)}等修辞手法")
        else:
            comments.append("建议适当使用比喻、排比等修辞手法增强表达效果")

        # Higher grades are expected to use at least two device types.
        if grade >= 5 and len(rhetoric_types) < 2:
            score -= 10
            comments.append("该年级建议至少使用2种以上修辞手法")

        return min(100, max(0, score)), comments

    def review_essay(self, request: "EssayReviewRequest") -> Dict:
        """Run all four dimension scorers and assemble the review result.

        Returns:
            A dict with the total score, per-dimension weighted scores (each
            scaled to ``request.max_score``), character statistics, the
            overall comment, per-dimension comments, the grammar errors
            (empty when ``enable_suggestions`` is False) and the elapsed
            time in milliseconds.
        """
        start_time = time.time()

        struct_score, struct_comments = self.score_structure(request.text, request.grade)
        grammar_score, grammar_errors = self.score_grammar(request.text)
        content_score, content_comments = self.score_content(
            request.text, request.title, request.genre, request.grade)
        rhetoric_score, rhetoric_comments = self.score_rhetoric(request.text, request.grade)

        weights = self.DIMENSION_WEIGHTS
        weighted_score = (
            struct_score * weights["structure"] +
            grammar_score * weights["grammar"] +
            content_score * weights["content"] +
            rhetoric_score * weights["rhetoric"]
        )
        total_score = round(weighted_score / 100 * request.max_score, 1)

        def scaled(dimension_score: float, key: str) -> float:
            # Contribution of one dimension to the total, on the max_score scale.
            return round(dimension_score / 100 * request.max_score * weights[key], 1)

        char_stats = self._text_analyzer.count_characters(request.text)

        overall_comment = self._generate_overall_comment(
            total_score, request.max_score, struct_comments,
            content_comments, rhetoric_comments
        )

        elapsed = (time.time() - start_time) * 1000

        return {
            "total_score": total_score,
            "max_score": request.max_score,
            "dimensions": {
                "structure": scaled(struct_score, "structure"),
                "grammar": scaled(grammar_score, "grammar"),
                "content": scaled(content_score, "content"),
                "rhetoric": scaled(rhetoric_score, "rhetoric"),
            },
            "character_count": char_stats,
            "overall_comment": overall_comment,
            "structure_analysis": struct_comments,
            "content_analysis": content_comments,
            "rhetoric_analysis": rhetoric_comments,
            "grammar_errors": [e.dict() for e in grammar_errors] if request.enable_suggestions else [],
            "inference_time_ms": round(elapsed, 2)
        }

    def _generate_overall_comment(self, score: float, max_score: int,
                                    struct_comments: List, content_comments: List,
                                    rhetoric_comments: List) -> str:
        """Build the overall comment from the score ratio and leading remarks.

        The prefix is chosen from ``score / max_score`` (thresholds 0.9,
        0.75, 0.6). It is followed by the first comment, if any, of the
        structure, content and rhetoric analyses, joined by '；'.
        """
        # A non-positive max_score has no meaningful ratio; treat it as 0
        # instead of dividing by zero.
        ratio = score / max_score if max_score > 0 else 0.0
        if ratio >= 0.9:
            prefix = "优秀！"
        elif ratio >= 0.75:
            prefix = "良好。"
        elif ratio >= 0.6:
            prefix = "中等。"
        else:
            prefix = "需要加强。"

        suggestions = [
            group[0]
            for group in (struct_comments, content_comments, rhetoric_comments)
            if group
        ]
        return f"{prefix}{'；'.join(suggestions)}"


# ==================== API路由定义 ====================

# Router for the essay-review endpoints; every route registered on it is
# served under the /api/v1 prefix.
router = APIRouter(prefix="/api/v1", tags=["作文批改"])
# Single scoring engine instance, created at import time and used by the
# route handler below.
_scoring_engine = EssayScoringEngine()


@router.post("/essay/review")
async def review_essay(request: EssayReviewRequest):
    """Score an essay and return the review.

    POST /api/v1/essay/review

    Takes the OCR-recognised essay text and returns the total score, the
    per-dimension analysis and the correction suggestions wrapped in a
    ``{"code", "msg", "data"}`` envelope.

    Raises:
        HTTPException: status 500 when the scoring engine fails.
    """
    try:
        result = _scoring_engine.review_essay(request)
    except Exception as e:
        # Record the full traceback server-side, but do not echo internal
        # exception text back to the client.
        logger.exception("作文批改异常: %s", e)
        raise HTTPException(status_code=500, detail="作文批改服务异常") from e

    # Audit log entry for the completed review.
    logger.info(
        "作文批改完成: score=%s/%s, student=%s, assignment=%s, chars=%s, time=%sms",
        result['total_score'], request.max_score,
        request.student_id, request.assignment_id,
        result['character_count']['chinese'], result['inference_time_ms'],
    )
    return {"code": 200, "msg": "success", "data": result}