software copyright

2026-03-22 15:24:40 +08:00
parent e303bb868a
commit 60f336e345
155 changed files with 127262 additions and 0 deletions
@@ -0,0 +1,446 @@
+# 自然写手写识别与AI分析引擎软件 V1.0
+# 作文批改接口模块 - AI作文评分与批改建议服务
+
+"""
+作文批改API接口
+提供AI作文评分、多维度分析（结构/语法/内容/修辞）、批改建议生成等功能
+支持小学至初中阶段作文批改，基于大语言模型与NLP分析管道
+"""
+
+import time
+import json
+import logging
+import hashlib
+import re
+from typing import List, Dict, Optional, Tuple
+from dataclasses import dataclass, field
+from enum import Enum
+from fastapi import APIRouter, HTTPException, Depends
+from pydantic import BaseModel, Field, validator
+
+# Module-level logger named after this module; used below for engine
+# start-up, per-request audit lines and error reporting.
+logger = logging.getLogger(__name__)
+
+# ==================== 数据模型定义 ====================
+
+class EssayReviewRequest(BaseModel):
+    """Request payload for an essay review.
+
+    Validation is declarative (pydantic ``Field`` constraints) plus the
+    ``genre`` validator defined below.
+    """
+    # Essay body as recognised by OCR; between 10 and 5000 characters.
+    text: str = Field(..., min_length=10, max_length=5000, description="作文OCR识别文本")
+    # Optional essay title.
+    title: Optional[str] = Field(None, description="作文题目")
+    # School grade in the range 1-9; defaults to 3.
+    grade: int = Field(3, ge=1, le=9, description="年级(1-9)")
+    # Essay genre; restricted to a fixed set by validate_genre().
+    genre: str = Field("narrative", description="文体类型: narrative/argumentative/expository/descriptive")
+    # Full marks for the essay. Must be strictly positive: the scoring
+    # engine divides by this value when it derives the overall comment, so
+    # a value of 0 would otherwise surface as a 500 error.
+    max_score: int = Field(100, gt=0, description="满分值")
+    # Optional identifiers, only echoed into the audit log by the endpoint.
+    student_id: Optional[str] = Field(None, description="学生ID")
+    assignment_id: Optional[str] = Field(None, description="作业ID")
+    # When False the response omits the per-sentence grammar errors.
+    enable_suggestions: bool = Field(True, description="是否生成修改建议")
+
+    @validator('genre')
+    def validate_genre(cls, v):
+        """Reject any genre that is not one of the four supported values.
+
+        Raises:
+            ValueError: if ``v`` is not a supported genre (pydantic turns
+                this into a validation error).
+        """
+        valid_genres = ['narrative', 'argumentative', 'expository', 'descriptive']
+        if v not in valid_genres:
+            raise ValueError(f'文体类型必须为: {valid_genres}')
+        return v
+
+
+class SentenceError(BaseModel):
+    """One sentence-level error annotation produced during grammar scoring."""
+
+    # The offending sentence text.
+    sentence: str = Field(
+        ...,
+        description="原始句子",
+    )
+    # Category of the error.
+    error_type: str = Field(
+        ...,
+        description="错误类型",
+    )
+    # Human-readable advice on how to fix the error.
+    suggestion: str = Field(
+        ...,
+        description="修改建议",
+    )
+    # Position index of the sentence within the original essay text.
+    position: int = Field(
+        ...,
+        description="句子在原文中的位置索引",
+    )
+
+
+class EssayScoreDetail(BaseModel):
+    """Per-dimension score breakdown for an essay."""
+
+    # Structure score.
+    structure: float = Field(
+        ...,
+        description="结构分",
+    )
+    # Grammar score.
+    grammar: float = Field(
+        ...,
+        description="语法分",
+    )
+    # Content score.
+    content: float = Field(
+        ...,
+        description="内容分",
+    )
+    # Rhetoric score.
+    rhetoric: float = Field(
+        ...,
+        description="修辞分",
+    )
+    # Handwriting score; optional and None when not available.
+    handwriting: Optional[float] = Field(
+        None,
+        description="书写分（如有）",
+    )
+
+
+# ==================== 文本分析工具 ====================
+
+class TextAnalyzer:
+    """Stateless helpers for basic Chinese text analysis.
+
+    Offers sentence splitting, paragraph splitting, character counting and
+    a heuristic, regex-based detector for a few rhetorical devices. Every
+    method is a ``staticmethod``.
+    """
+
+    # Sentence-terminating punctuation. The two-character ellipsis entry is
+    # kept unchanged for backward compatibility; split_sentences() works on
+    # the individual characters that make up these entries.
+    SENTENCE_ENDINGS = {'。', '！', '？', '……', '；'}
+    # Paragraph indent marker made of full-width spaces (not referenced by
+    # the methods of this class).
+    PARAGRAPH_INDENT = '　　'
+
+    # Characters counted as punctuation by count_characters(). Curly quotes
+    # are written as escapes so they survive tools that normalise quotes.
+    _PUNCTUATION = frozenset(
+        '，。！？、；：（）《》…—'
+        '"\''
+        '\u201c\u201d\u2018\u2019'
+    )
+
+    @staticmethod
+    def split_sentences(text: str) -> List[str]:
+        """Split ``text`` into a list of stripped, non-empty sentences.
+
+        A sentence ends at a run of one or more end-of-sentence characters
+        (``。！？；…``). Treating a run as a single terminator means that
+        an ellipsis ``……`` ends a sentence and that repeated marks such as
+        ``！！`` stay attached to the sentence instead of producing a
+        punctuation-only "sentence". Text after the last terminator is
+        returned as a final sentence.
+        """
+        # Flatten multi-character entries ("……") into single characters,
+        # because the scan below looks at one character at a time.
+        endings = set("".join(TextAnalyzer.SENTENCE_ENDINGS))
+
+        sentences: List[str] = []
+        buffer: List[str] = []
+        closed = False  # True once the buffer already ends with a terminator
+        for char in text:
+            if closed and char not in endings:
+                sentence = "".join(buffer).strip()
+                if sentence:
+                    sentences.append(sentence)
+                buffer = []
+                closed = False
+            buffer.append(char)
+            if char in endings:
+                closed = True
+
+        tail = "".join(buffer).strip()
+        if tail:
+            sentences.append(tail)
+        return sentences
+
+    @staticmethod
+    def split_paragraphs(text: str) -> List[str]:
+        """Split ``text`` on newlines and return the stripped, non-empty paragraphs."""
+        stripped = (line.strip() for line in text.split('\n'))
+        return [paragraph for paragraph in stripped if paragraph]
+
+    @staticmethod
+    def count_characters(text: str) -> Dict[str, int]:
+        """Count characters in ``text``.
+
+        Returns:
+            A dict with three keys:
+            ``total`` - number of non-whitespace characters (full-width
+            spaces, tabs and carriage returns are excluded as well as
+            ASCII spaces and newlines);
+            ``chinese`` - characters in the range U+4E00..U+9FFF;
+            ``punctuation`` - characters listed in ``_PUNCTUATION``,
+            including straight and curly quotation marks.
+        """
+        chinese_count = sum(1 for c in text if '\u4e00' <= c <= '\u9fff')
+        punctuation_count = sum(1 for c in text if c in TextAnalyzer._PUNCTUATION)
+        # str.split() with no argument splits on any Unicode whitespace.
+        total_count = len("".join(text.split()))
+        return {
+            "total": total_count,
+            "chinese": chinese_count,
+            "punctuation": punctuation_count
+        }
+
+    @staticmethod
+    def detect_rhetoric(text: str) -> List[Dict]:
+        """Detect rhetorical devices in ``text`` with simple heuristics.
+
+        Three devices are recognised: simile, parallelism and
+        personification. The result lists similes first, then parallelism,
+        then personification.
+
+        Returns:
+            A list of dicts with keys ``type`` (machine name), ``name``
+            (display name), ``text`` (matched text) and ``position``
+            (character offset of the match in ``text``).
+        """
+        rhetorics: List[Dict] = []
+
+        # Simile: keyword-driven patterns. Patterns may overlap, in which
+        # case the same passage is reported once per matching pattern.
+        simile_patterns = [
+            r'像.{2,10}一样', r'如同.{2,10}', r'仿佛.{2,10}',
+            r'好像.{2,10}', r'犹如.{2,10}', r'宛如.{2,10}'
+        ]
+        for pattern in simile_patterns:
+            for m in re.finditer(pattern, text):
+                rhetorics.append({
+                    "type": "simile", "name": "比喻",
+                    "text": m.group(), "position": m.start()
+                })
+
+        # Parallelism: three consecutive sentences of similar length that
+        # start with the same character.
+        sentences = TextAnalyzer.split_sentences(text)
+        # Locate each sentence by searching forward from the end of the
+        # previous one, so a sentence that occurs more than once in the
+        # text is reported at its own offset rather than the first one.
+        starts: List[int] = []
+        cursor = 0
+        for sentence in sentences:
+            start = text.find(sentence, cursor)
+            starts.append(start)
+            if start >= 0:
+                cursor = start + len(sentence)
+        triples = zip(sentences, sentences[1:], sentences[2:])
+        for i, (s1, s2, s3) in enumerate(triples):
+            similar_length = abs(len(s1) - len(s2)) < 5 and abs(len(s2) - len(s3)) < 5
+            if similar_length and len(s1) > 5 and s1[0] == s2[0] == s3[0]:
+                rhetorics.append({
+                    "type": "parallelism", "name": "排比",
+                    "text": f"{s1}{s2}{s3}", "position": starts[i]
+                })
+
+        # Personification: a natural-object character followed within three
+        # characters by a human action, or by a human-manner word. The
+        # manner words are whole words, so they are an alternation rather
+        # than a character class (a class would match a lone "轻").
+        personification_patterns = [
+            r'[风雨雪花树草月阳光河水山].{0,3}[笑哭唱跳跑走说叫]',
+            r'[风雨雪花树草月阳光河水山].{0,3}(?:温柔|轻轻|悄悄)'
+        ]
+        for pattern in personification_patterns:
+            for m in re.finditer(pattern, text):
+                rhetorics.append({
+                    "type": "personification", "name": "拟人",
+                    "text": m.group(), "position": m.start()
+                })
+
+        return rhetorics
+
+
+# ==================== 作文评分引擎 ====================
+
+class EssayScoringEngine:
+    """Rule-based essay scoring engine.
+
+    An essay is scored on four dimensions (structure, grammar, content,
+    rhetoric), each on a 0-100 scale. ``review_essay`` combines them with
+    ``DIMENSION_WEIGHTS`` (25% / 25% / 30% / 20%) and rescales the result
+    to the maximum score requested by the caller.
+    """
+
+    # Expected number of Chinese characters per grade: grade -> (min, max).
+    EXPECTED_LENGTH = {
+        1: (50, 150), 2: (100, 250), 3: (200, 400),
+        4: (300, 500), 5: (350, 600), 6: (400, 700),
+        7: (500, 800), 8: (600, 900), 9: (600, 1000)
+    }
+
+    # Weight of each dimension in the total score.
+    DIMENSION_WEIGHTS = {
+        "structure": 0.25,
+        "grammar": 0.25,
+        "content": 0.30,
+        "rhetoric": 0.20
+    }
+
+    def __init__(self):
+        """Create the text analyzer and load the grammar error patterns."""
+        self._text_analyzer = TextAnalyzer()
+        self._error_patterns = self._load_error_patterns()
+        logger.info("作文评分引擎初始化完成")
+
+    def _load_error_patterns(self) -> List[Dict]:
+        """Return the built-in list of grammar/style error patterns.
+
+        Each entry holds a regex ``pattern``, an error ``type`` and the
+        user-facing ``msg`` that is reported as the suggestion.
+        """
+        return [
+            {"pattern": r"的的", "type": "repetition", "msg": "重复用字'的的'"},
+            {"pattern": r"了了", "type": "repetition", "msg": "重复用字'了了'"},
+            {"pattern": r"因为.{5,50}因为", "type": "logic", "msg": "重复使用'因为'，建议精简"},
+            {"pattern": r"然后.{3,20}然后.{3,20}然后", "type": "style", "msg": "过度使用'然后'连接"},
+            {"pattern": r"非常非常", "type": "repetition", "msg": "重复使用'非常'"},
+            {"pattern": r"[，]{3,}", "type": "punctuation", "msg": "连续使用多个逗号，建议使用句号断句"},
+        ]
+
+    def score_structure(self, text: str, grade: int) -> Tuple[float, List[str]]:
+        """Score the essay's structure on a 0-100 scale.
+
+        Points are deducted for too few paragraphs, for a Chinese-character
+        count outside the range expected for ``grade``, and for a very
+        short first or last paragraph.
+
+        Args:
+            text: Essay text; paragraphs are separated by newlines.
+            grade: School grade used to look up ``EXPECTED_LENGTH``; a
+                grade missing from the table falls back to (300, 600).
+
+        Returns:
+            ``(score, comments)`` where ``comments`` are user-facing remarks.
+        """
+        comments: List[str] = []
+        score = 100.0
+
+        paragraphs = self._text_analyzer.split_paragraphs(text)
+        chinese_chars = self._text_analyzer.count_characters(text)["chinese"]
+
+        # Paragraph count: a single block is penalised heavily, two mildly.
+        if len(paragraphs) < 2:
+            score -= 25
+            comments.append("文章缺少段落划分，建议分段书写使结构更清晰")
+        elif len(paragraphs) < 3:
+            score -= 10
+            comments.append("段落较少，建议增加过渡段落")
+
+        # Length against the range expected for the grade.
+        min_len, max_len = self.EXPECTED_LENGTH.get(grade, (300, 600))
+        if chinese_chars < min_len:
+            deficit = min_len - chinese_chars
+            # One point per 10 missing characters, capped at 30 points.
+            score -= min(30, deficit // 10)
+            comments.append(f"字数偏少（{chinese_chars}字），该年级建议{min_len}-{max_len}字")
+        elif chinese_chars > max_len * 1.5:
+            score -= 5
+            comments.append("字数偏多，建议精简语句突出重点")
+
+        # Opening and closing paragraphs should not be trivially short.
+        if paragraphs:
+            if len(paragraphs[0]) < 15:
+                score -= 10
+                comments.append("开头过于简短，建议丰富开篇引入")
+            if len(paragraphs[-1]) < 10:
+                score -= 10
+                comments.append("结尾过于简短，建议加强收束呼应主题")
+
+        return max(0.0, score), comments
+
+    def score_grammar(self, text: str) -> Tuple[float, List[SentenceError]]:
+        """Score grammatical correctness on a 0-100 scale.
+
+        Every match of a built-in error pattern costs 5 points and every
+        sentence longer than 80 characters costs 3 points; the score is
+        floored at 0.
+
+        Returns:
+            ``(score, errors)`` where each error carries the offending
+            text, its type, a suggestion and its character offset in
+            ``text``.
+        """
+        errors: List[SentenceError] = []
+        score = 100.0
+
+        # Pattern-based detection of common mistakes.
+        for ep in self._error_patterns:
+            for m in re.finditer(ep["pattern"], text):
+                errors.append(SentenceError(
+                    sentence=m.group(),
+                    error_type=ep["type"],
+                    suggestion=ep["msg"],
+                    position=m.start()
+                ))
+                score -= 5
+
+        # Overly long sentences. Each sentence is located by searching
+        # forward from the end of the previous one, so a sentence that is
+        # repeated in the essay is reported at its own offset instead of
+        # the offset of its first occurrence.
+        cursor = 0
+        for s in self._text_analyzer.split_sentences(text):
+            start = text.find(s, cursor)
+            if start < 0:
+                # Defensive fallback: search from the beginning.
+                start = text.find(s)
+            else:
+                cursor = start + len(s)
+            if len(s) > 80:
+                errors.append(SentenceError(
+                    sentence=s[:30] + "...",
+                    error_type="long_sentence",
+                    suggestion="句子过长，建议拆分为多个短句以提高可读性",
+                    position=start
+                ))
+                score -= 3
+
+        return max(0.0, score), errors
+
+    def score_content(self, text: str, title: Optional[str], genre: str, grade: int) -> Tuple[float, List[str]]:
+        """Score content quality on a 0-100 scale.
+
+        Starts from a base of 85 and adjusts for vocabulary richness
+        (distinct Chinese characters / all Chinese characters), use of
+        logical connectors and presence of emotion words.
+
+        Args:
+            text: Essay text.
+            title: Accepted for interface compatibility; not used by the
+                current heuristics.
+            genre: Accepted for interface compatibility; not used.
+            grade: Accepted for interface compatibility; not used.
+
+        Returns:
+            ``(score, comments)`` with the score clamped to 0-100.
+        """
+        comments: List[str] = []
+        score = 85.0  # generous base: content is hard to quantify precisely
+
+        chinese_chars = self._text_analyzer.count_characters(text)["chinese"]
+        sentences = self._text_analyzer.split_sentences(text)
+
+        # Vocabulary richness, approximated at character level.
+        unique_chars = {c for c in text if '\u4e00' <= c <= '\u9fff'}
+        vocab_richness = len(unique_chars) / max(chinese_chars, 1)
+        if vocab_richness > 0.6:
+            score += 10
+            comments.append("词汇丰富，用词多样化")
+        elif vocab_richness < 0.3:
+            score -= 10
+            comments.append("词汇较为单一，建议使用更丰富的词语表达")
+
+        # Logical coherence: number of distinct connectors used.
+        connectors = ['因此', '所以', '但是', '然而', '首先', '其次', '最后', '总之',
+                      '不仅', '而且', '虽然', '但', '因为', '于是']
+        # Match longest-first so that "但是" is not additionally counted as
+        # the shorter connector "但" it contains.
+        connector_regex = '|'.join(
+            re.escape(c) for c in sorted(connectors, key=len, reverse=True))
+        used_connectors = set(re.findall(connector_regex, text))
+        if len(used_connectors) >= 3:
+            score += 5
+            comments.append("逻辑衔接词使用恰当，行文连贯")
+        elif len(used_connectors) == 0 and len(sentences) > 5:
+            score -= 5
+            comments.append("缺少逻辑连接词，建议增加过渡衔接使行文更连贯")
+
+        # Emotional expression: any listed emotion word earns a small bonus.
+        emotion_words = ['开心', '快乐', '高兴', '感动', '难过', '伤心', '惊讶',
+                         '温暖', '幸福', '骄傲', '担心', '紧张']
+        if any(w in text for w in emotion_words):
+            score += 3
+            comments.append("有恰当的情感表达，增强了文章感染力")
+
+        return min(100.0, max(0.0, score)), comments
+
+    def score_rhetoric(self, text: str, grade: int) -> Tuple[float, List[str]]:
+        """Score the use of rhetorical devices on a 0-100 scale.
+
+        Starts from a base of 70, adds 25 for three or more distinct
+        device types or 15 for one or two, and deducts 10 for grade 5 and
+        above when fewer than two types are used.
+
+        Returns:
+            ``(score, comments)`` with the score clamped to 0-100.
+        """
+        comments: List[str] = []
+        score = 70.0  # base score
+
+        rhetorics = self._text_analyzer.detect_rhetoric(text)
+
+        rhetoric_types = {r["type"] for r in rhetorics}
+        if len(rhetoric_types) >= 3:
+            score += 25
+            comments.append(f"修辞手法运用丰富，使用了{len(rhetoric_types)}种修辞手法")
+        elif len(rhetoric_types) >= 1:
+            score += 15
+            # De-duplicate while keeping detection order, so the comment
+            # text is the same on every run (set order is not stable
+            # across processes for strings).
+            used_names = list(dict.fromkeys(r["name"] for r in rhetorics))
+            comments.append(f"使用了{'、'.join(used_names)}等修辞手法")
+        else:
+            comments.append("建议适当使用比喻、排比等修辞手法增强表达效果")
+
+        # Higher grades are expected to use more devices.
+        if grade >= 5 and len(rhetoric_types) < 2:
+            score -= 10
+            comments.append("该年级建议至少使用2种以上修辞手法")
+
+        return min(100.0, max(0.0, score)), comments
+
+    def review_essay(self, request: EssayReviewRequest) -> Dict:
+        """Review an essay and return the total score plus per-dimension analysis.
+
+        Args:
+            request: Validated review request.
+
+        Returns:
+            A dict with the total score, the maximum score, the weighted
+            contribution of each dimension (scaled to ``max_score``),
+            character statistics, an overall comment, the per-dimension
+            comment lists, the grammar errors (empty when
+            ``enable_suggestions`` is False) and the elapsed time in
+            milliseconds.
+        """
+        start_time = time.time()
+
+        # Score each dimension independently on a 0-100 scale.
+        struct_score, struct_comments = self.score_structure(request.text, request.grade)
+        grammar_score, grammar_errors = self.score_grammar(request.text)
+        content_score, content_comments = self.score_content(
+            request.text, request.title, request.genre, request.grade)
+        rhetoric_score, rhetoric_comments = self.score_rhetoric(request.text, request.grade)
+
+        raw_scores = {
+            "structure": struct_score,
+            "grammar": grammar_score,
+            "content": content_score,
+            "rhetoric": rhetoric_score,
+        }
+
+        # Weighted 0-100 score, then rescaled to the requested maximum.
+        weighted_score = sum(
+            raw * self.DIMENSION_WEIGHTS[name] for name, raw in raw_scores.items())
+        total_score = round(weighted_score / 100 * request.max_score, 1)
+
+        char_stats = TextAnalyzer.count_characters(request.text)
+
+        overall_comment = self._generate_overall_comment(
+            total_score, request.max_score, struct_comments,
+            content_comments, rhetoric_comments
+        )
+
+        elapsed = (time.time() - start_time) * 1000
+
+        return {
+            "total_score": total_score,
+            "max_score": request.max_score,
+            # Each dimension's weighted contribution to the total. The
+            # values are rounded individually, so their sum can differ
+            # from total_score by a rounding step.
+            "dimensions": {
+                name: round(raw / 100 * request.max_score * self.DIMENSION_WEIGHTS[name], 1)
+                for name, raw in raw_scores.items()
+            },
+            "character_count": char_stats,
+            "overall_comment": overall_comment,
+            "structure_analysis": struct_comments,
+            "content_analysis": content_comments,
+            "rhetoric_analysis": rhetoric_comments,
+            "grammar_errors": [e.dict() for e in grammar_errors] if request.enable_suggestions else [],
+            "inference_time_ms": round(elapsed, 2)
+        }
+
+    def _generate_overall_comment(self, score: float, max_score: int,
+                                    struct_comments: List, content_comments: List,
+                                    rhetoric_comments: List) -> str:
+        """Build the overall comment from the score ratio and the first remarks.
+
+        The prefix is chosen from ``score / max_score`` (>= 0.9, >= 0.75,
+        >= 0.6, otherwise) and is followed by the first comment of the
+        structure, content and rhetoric lists, when present. A
+        ``max_score`` of 0 is treated as a ratio of 0 instead of raising
+        ``ZeroDivisionError``.
+        """
+        ratio = score / max_score if max_score else 0.0
+        if ratio >= 0.9:
+            prefix = "优秀！"
+        elif ratio >= 0.75:
+            prefix = "良好。"
+        elif ratio >= 0.6:
+            prefix = "中等。"
+        else:
+            prefix = "需要加强。"
+
+        # At most one remark per dimension, in a fixed order.
+        suggestions = [
+            group[0]
+            for group in (struct_comments, content_comments, rhetoric_comments)
+            if group
+        ]
+
+        return f"{prefix}{'；'.join(suggestions)}"
+
+
+# ==================== API路由定义 ====================
+
+# Router for the essay-review API; its routes are registered under the
+# /api/v1 prefix.
+router = APIRouter(prefix="/api/v1", tags=["作文批改"])
+# One scoring engine, created when this module is imported and reused by
+# the request handler below.
+_scoring_engine = EssayScoringEngine()
+
+
+@router.post("/essay/review")
+async def review_essay(request: EssayReviewRequest):
+    """Score an essay and return the review result.
+
+    POST /api/v1/essay/review
+
+    Args:
+        request: Validated review request containing the OCR-recognised
+            essay text and scoring options.
+
+    Returns:
+        ``{"code": 200, "msg": "success", "data": result}`` where
+        ``result`` is the dict produced by the scoring engine.
+
+    Raises:
+        HTTPException: status 500 when the scoring engine fails.
+    """
+    # Only the engine call is guarded; the audit log below reads keys the
+    # engine always sets.
+    try:
+        result = _scoring_engine.review_essay(request)
+    except Exception as e:
+        # logger.exception records the traceback, which a plain error log
+        # of str(e) would lose.
+        logger.exception("作文批改异常: %s", e)
+        # NOTE(review): the exception text is echoed to the client, as in
+        # the original behaviour; confirm this does not expose internals.
+        raise HTTPException(status_code=500, detail=f"作文批改服务异常: {str(e)}") from e
+
+    # Audit log entry for the completed review.
+    logger.info(
+        "作文批改完成: score=%s/%s, student=%s, assignment=%s, chars=%s, time=%sms",
+        result['total_score'], request.max_score,
+        request.student_id, request.assignment_id,
+        result['character_count']['chinese'], result['inference_time_ms'],
+    )
+    return {"code": 200, "msg": "success", "data": result}