# 447 lines · 18 KiB · Python
# 自然写手写识别与AI分析引擎软件 V1.0
# 作文批改接口模块 - AI作文评分与批改建议服务

"""
作文批改API接口
提供AI作文评分、多维度分析(结构/语法/内容/修辞)、批改建议生成等功能
支持小学至初中阶段作文批改,基于大语言模型与NLP分析管道
"""
|
||
|
||
import time
|
||
import json
|
||
import logging
|
||
import hashlib
|
||
import re
|
||
from typing import List, Dict, Optional, Tuple
|
||
from dataclasses import dataclass, field
|
||
from enum import Enum
|
||
from fastapi import APIRouter, HTTPException, Depends
|
||
from pydantic import BaseModel, Field, validator
|
||
|
||
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
||
|
||
# ==================== 数据模型定义 ====================
|
||
|
||
class EssayReviewRequest(BaseModel):
    """Request payload for the essay review endpoint.

    ``text`` carries the essay body; ``grade``, ``genre`` and ``max_score``
    parameterise scoring. ``student_id`` and ``assignment_id`` are optional
    identifiers; in this file they are only written to the audit log by the
    route handler.
    """

    text: str = Field(..., min_length=10, max_length=5000, description="作文OCR识别文本")
    title: Optional[str] = Field(None, description="作文题目")
    grade: int = Field(3, ge=1, le=9, description="年级(1-9)")
    genre: str = Field("narrative", description="文体类型: narrative/argumentative/expository/descriptive")
    # Must be strictly positive: the scoring engine divides the total score by
    # max_score when it builds the overall comment, so 0 would raise
    # ZeroDivisionError and a negative value would produce meaningless scores.
    max_score: int = Field(100, gt=0, description="满分值")
    student_id: Optional[str] = Field(None, description="学生ID")
    assignment_id: Optional[str] = Field(None, description="作业ID")
    enable_suggestions: bool = Field(True, description="是否生成修改建议")

    @validator('genre')
    def validate_genre(cls, v):
        """Reject any genre outside the four supported values."""
        valid_genres = ['narrative', 'argumentative', 'expository', 'descriptive']
        if v not in valid_genres:
            raise ValueError(f'文体类型必须为: {valid_genres}')
        return v
|
||
|
||
|
||
class SentenceError(BaseModel):
    """One flagged span of essay text together with a suggested fix.

    All four fields are required.
    """

    # The offending text as reported by the checker that produced this entry.
    sentence: str = Field(default=..., description="原始句子")
    # Category label of the problem.
    error_type: str = Field(default=..., description="错误类型")
    # Human-readable advice on how to fix it.
    suggestion: str = Field(default=..., description="修改建议")
    # Index into the original text where the flagged span was found.
    position: int = Field(default=..., description="句子在原文中的位置索引")
|
||
|
||
|
||
class EssayScoreDetail(BaseModel):
    """Per-dimension score breakdown of an essay.

    The four main dimensions are required; the handwriting score is optional
    and defaults to ``None``.
    """

    structure: float = Field(default=..., description="结构分")
    grammar: float = Field(default=..., description="语法分")
    content: float = Field(default=..., description="内容分")
    rhetoric: float = Field(default=..., description="修辞分")
    handwriting: Optional[float] = Field(default=None, description="书写分(如有)")
|
||
|
||
|
||
# ==================== 文本分析工具 ====================
|
||
|
||
class TextAnalyzer:
    """Lightweight helpers for analysing Chinese essay text.

    Every method is static and purely character/regex based: sentence and
    paragraph splitting, character statistics, and heuristic detection of
    rhetorical devices (simile, parallelism, personification).
    """

    # Characters that terminate a sentence. Splitting is done per character,
    # so the single ellipsis character is what actually matches; the
    # two-character '……' entry is kept only for backward compatibility with
    # code that may inspect this set. Half-width and full-width forms of
    # ! ? ; are both accepted.
    SENTENCE_ENDINGS = {
        '。', '!', '?', '……', ';',
        '\u2026',                      # single ellipsis character
        '\uff01', '\uff1f', '\uff1b',  # full-width ! ? ;
    }
    # Paragraph indent marker (not referenced by the methods of this class).
    PARAGRAPH_INDENT = ' '

    # Characters counted as punctuation by count_characters(): the original
    # set plus single quotes, full-width forms and curly quotes.
    PUNCTUATION_CHARS = frozenset(
        ',。!?、;:"()《》…—'
        "'"
        '\uff0c\uff01\uff1f\uff1b\uff1a\uff08\uff09'  # full-width , ! ? ; : ( )
        '\u201c\u201d\u2018\u2019'                      # curly double/single quotes
    )

    # Simile markers such as "像...一样", "如同", "仿佛".
    _SIMILE_PATTERNS = tuple(re.compile(p) for p in (
        r'像.{2,10}一样', r'如同.{2,10}', r'仿佛.{2,10}',
        r'好像.{2,10}', r'犹如.{2,10}', r'宛如.{2,10}',
    ))

    # Personification: a nature noun followed closely by a human action.
    # NOTE(review): the second pattern's trailing character class matches any
    # single one of its characters, not the two-character words it appears to
    # spell out - confirm whether an alternation was intended.
    _PERSONIFICATION_PATTERNS = tuple(re.compile(p) for p in (
        r'[风雨雪花树草月阳光河水山].{0,3}[笑哭唱跳跑走说叫]',
        r'[风雨雪花树草月阳光河水山].{0,3}[温柔轻轻悄悄]',
    ))

    @staticmethod
    def split_sentences(text: str) -> List[str]:
        """Split ``text`` into a list of stripped, non-empty sentences.

        A sentence ends after a run of one or more terminator characters, so
        a run such as ``?!`` or a doubled ellipsis stays attached to the
        sentence it closes instead of becoming a sentence of its own. Text
        after the last terminator is returned as a final sentence.
        """
        sentences: List[str] = []
        buffer: List[str] = []

        def flush() -> None:
            sentence = ''.join(buffer).strip()
            if sentence:
                sentences.append(sentence)
            buffer.clear()

        after_ending = False
        for char in text:
            is_ending = char in TextAnalyzer.SENTENCE_ENDINGS
            # The sentence is closed by the first non-terminator that follows
            # a terminator run.
            if after_ending and not is_ending:
                flush()
            buffer.append(char)
            after_ending = is_ending
        flush()
        return sentences

    @staticmethod
    def split_paragraphs(text: str) -> List[str]:
        """Split ``text`` on newlines and return the stripped, non-empty parts."""
        return [p.strip() for p in text.split('\n') if p.strip()]

    @staticmethod
    def count_characters(text: str) -> Dict[str, int]:
        """Return character statistics for ``text``.

        Returns a dict with:
          * ``total``: number of non-whitespace characters (spaces, tabs,
            line breaks and full-width spaces are all excluded);
          * ``chinese``: characters in the range U+4E00..U+9FFF;
          * ``punctuation``: characters found in ``PUNCTUATION_CHARS``.
        """
        chinese_count = sum(1 for c in text if '\u4e00' <= c <= '\u9fff')
        punctuation_count = sum(1 for c in text if c in TextAnalyzer.PUNCTUATION_CHARS)
        total_count = sum(1 for c in text if not c.isspace())
        return {
            "total": total_count,
            "chinese": chinese_count,
            "punctuation": punctuation_count,
        }

    @staticmethod
    def _locate_sentences(text: str, sentences: List[str]) -> List[int]:
        """Return the start offset in ``text`` of each sentence, in order.

        The search resumes after the previous sentence, so a sentence that
        occurs several times is mapped to its own occurrence rather than to
        the first one.
        """
        offsets: List[int] = []
        cursor = 0
        for sentence in sentences:
            start = text.find(sentence, cursor)
            if start < 0:
                # Defensive fallback; not expected for output of split_sentences.
                start = text.find(sentence)
            else:
                cursor = start + len(sentence)
            offsets.append(start)
        return offsets

    @staticmethod
    def detect_rhetoric(text: str) -> List[Dict]:
        """Heuristically detect rhetorical devices in ``text``.

        Returns a list of dicts with keys ``type``, ``name``, ``text`` and
        ``position`` (start offset in ``text``). Similes come first, then
        parallelism, then personification.
        """
        rhetorics: List[Dict] = []

        # Simile: keyword-based patterns.
        for pattern in TextAnalyzer._SIMILE_PATTERNS:
            for m in pattern.finditer(text):
                rhetorics.append({
                    "type": "simile", "name": "比喻",
                    "text": m.group(), "position": m.start()
                })

        # Parallelism (simplified): three consecutive sentences of similar
        # length that start with the same character.
        sentences = TextAnalyzer.split_sentences(text)
        offsets = TextAnalyzer._locate_sentences(text, sentences)
        triples = zip(offsets, sentences, sentences[1:], sentences[2:])
        for start, s1, s2, s3 in triples:
            similar_length = abs(len(s1) - len(s2)) < 5 and abs(len(s2) - len(s3)) < 5
            if similar_length and len(s1) > 5 and s1[0] == s2[0] == s3[0]:
                rhetorics.append({
                    "type": "parallelism", "name": "排比",
                    "text": f"{s1}{s2}{s3}", "position": start
                })

        # Personification: nature noun followed by a human action/attribute.
        for pattern in TextAnalyzer._PERSONIFICATION_PATTERNS:
            for m in pattern.finditer(text):
                rhetorics.append({
                    "type": "personification", "name": "拟人",
                    "text": m.group(), "position": m.start()
                })

        return rhetorics
|
||
|
||
|
||
# ==================== 作文评分引擎 ====================
|
||
|
||
class EssayScoringEngine:
    """Heuristic multi-dimension essay scorer.

    Each dimension is scored on a 0-100 scale and combined with
    ``DIMENSION_WEIGHTS``: structure 25%, grammar 25%, content 30%,
    rhetoric 20%. The weighted result is then mapped onto the request's
    ``max_score``.
    """

    # Expected Chinese-character count range (min, max) per grade.
    EXPECTED_LENGTH = {
        1: (50, 150), 2: (100, 250), 3: (200, 400),
        4: (300, 500), 5: (350, 600), 6: (400, 700),
        7: (500, 800), 8: (600, 900), 9: (600, 1000)
    }

    # Weight of each scoring dimension in the total.
    DIMENSION_WEIGHTS = {
        "structure": 0.25,
        "grammar": 0.25,
        "content": 0.30,
        "rhetoric": 0.20
    }

    def __init__(self):
        """Create the text analyzer and load the grammar error patterns."""
        self._text_analyzer = TextAnalyzer()
        self._error_patterns = self._load_error_patterns()
        logger.info("作文评分引擎初始化完成")

    def _load_error_patterns(self) -> List[Dict]:
        """Return the built-in grammar error patterns.

        Each entry has a regex ``pattern``, an error ``type`` and a ``msg``
        that is used as the suggestion text.
        """
        return [
            {"pattern": r"的的", "type": "repetition", "msg": "重复用字'的的'"},
            {"pattern": r"了了", "type": "repetition", "msg": "重复用字'了了'"},
            {"pattern": r"因为.{5,50}因为", "type": "logic", "msg": "重复使用'因为',建议精简"},
            {"pattern": r"然后.{3,20}然后.{3,20}然后", "type": "style", "msg": "过度使用'然后'连接"},
            {"pattern": r"非常非常", "type": "repetition", "msg": "重复使用'非常'"},
            # Matches runs of half-width and full-width (U+FF0C) commas.
            {"pattern": r"[,\uff0c]{3,}", "type": "punctuation", "msg": "连续使用多个逗号,建议使用句号断句"},
        ]

    def score_structure(self, text: str, grade: int) -> Tuple[float, List[str]]:
        """Score the essay structure on a 0-100 scale.

        Deductions are made for too few paragraphs, a Chinese-character count
        outside the range expected for ``grade`` (unknown grades fall back to
        300-600), and a very short first or last paragraph.

        Returns:
            ``(score, comments)`` where ``comments`` explains each deduction.
        """
        comments: List[str] = []
        score = 100.0

        paragraphs = self._text_analyzer.split_paragraphs(text)
        chinese_chars = self._text_analyzer.count_characters(text)["chinese"]

        # Paragraph count.
        if len(paragraphs) < 2:
            score -= 25
            comments.append("文章缺少段落划分,建议分段书写使结构更清晰")
        elif len(paragraphs) < 3:
            score -= 10
            comments.append("段落较少,建议增加过渡段落")

        # Length: one point per 10 missing characters, capped at 30.
        expected = self.EXPECTED_LENGTH.get(grade, (300, 600))
        if chinese_chars < expected[0]:
            deficit = expected[0] - chinese_chars
            score -= min(30, deficit // 10)
            comments.append(f"字数偏少({chinese_chars}字),该年级建议{expected[0]}-{expected[1]}字")
        elif chinese_chars > expected[1] * 1.5:
            score -= 5
            comments.append("字数偏多,建议精简语句突出重点")

        # Opening and closing paragraphs.
        if paragraphs:
            if len(paragraphs[0]) < 15:
                score -= 10
                comments.append("开头过于简短,建议丰富开篇引入")
            if len(paragraphs[-1]) < 10:
                score -= 10
                comments.append("结尾过于简短,建议加强收束呼应主题")

        return max(0.0, score), comments

    def score_grammar(self, text: str) -> Tuple[float, List[SentenceError]]:
        """Score grammar on a 0-100 scale.

        Every match of a built-in error pattern costs 5 points and every
        sentence longer than 80 characters costs 3 points.

        Returns:
            ``(score, errors)`` where each error records the flagged text,
            its type, a suggestion and its start offset in ``text``.
        """
        errors: List[SentenceError] = []
        score = 100.0

        # Pattern-based checks.
        for ep in self._error_patterns:
            for m in re.finditer(ep["pattern"], text):
                errors.append(SentenceError(
                    sentence=m.group(),
                    error_type=ep["type"],
                    suggestion=ep["msg"],
                    position=m.start()
                ))
                score -= 5

        # Over-long sentences. Offsets are found by scanning forward from the
        # end of the previous sentence, so a sentence that is repeated in the
        # text is reported at its own position rather than the first one.
        cursor = 0
        for sentence in self._text_analyzer.split_sentences(text):
            start = text.find(sentence, cursor)
            if start < 0:
                start = text.find(sentence)  # defensive fallback
            else:
                cursor = start + len(sentence)
            if len(sentence) > 80:
                errors.append(SentenceError(
                    sentence=sentence[:30] + "...",
                    error_type="long_sentence",
                    suggestion="句子过长,建议拆分为多个短句以提高可读性",
                    position=start
                ))
                score -= 3

        return max(0.0, score), errors

    def score_content(self, text: str, title: Optional[str], genre: str, grade: int) -> Tuple[float, List[str]]:
        """Score content quality on a 0-100 scale.

        Starts from a base of 85 and adjusts for vocabulary richness (ratio
        of distinct to total Chinese characters), use of logical connectors
        and presence of emotion words. ``title``, ``genre`` and ``grade`` are
        accepted but not used by the current heuristics.

        Returns:
            ``(score, comments)`` with the score clamped to 0-100.
        """
        comments: List[str] = []
        score = 85.0  # Generous base: content is hard to quantify precisely.

        char_stats = self._text_analyzer.count_characters(text)
        sentences = self._text_analyzer.split_sentences(text)

        # Vocabulary richness.
        unique_chars = {c for c in text if '\u4e00' <= c <= '\u9fff'}
        vocab_richness = len(unique_chars) / max(char_stats["chinese"], 1)
        if vocab_richness > 0.6:
            score += 10
            comments.append("词汇丰富,用词多样化")
        elif vocab_richness < 0.3:
            score -= 10
            comments.append("词汇较为单一,建议使用更丰富的词语表达")

        # Logical coherence: connectors present in the text.
        connectors = ['因此', '所以', '但是', '然而', '首先', '其次', '最后', '总之',
                      '不仅', '而且', '虽然', '但', '因为', '于是']
        used_connectors = [c for c in connectors if c in text]
        if len(used_connectors) >= 3:
            score += 5
            comments.append("逻辑衔接词使用恰当,行文连贯")
        elif not used_connectors and len(sentences) > 5:
            score -= 5
            comments.append("缺少逻辑连接词,建议增加过渡衔接使行文更连贯")

        # Emotional expression.
        emotion_words = ['开心', '快乐', '高兴', '感动', '难过', '伤心', '惊讶',
                         '温暖', '幸福', '骄傲', '担心', '紧张']
        if any(w in text for w in emotion_words):
            score += 3
            comments.append("有恰当的情感表达,增强了文章感染力")

        return min(100, max(0, score)), comments

    def score_rhetoric(self, text: str, grade: int) -> Tuple[float, List[str]]:
        """Score the use of rhetorical devices on a 0-100 scale.

        Starts from a base of 70, adds a bonus by number of distinct device
        types detected, and deducts 10 for grade 5 and above when fewer than
        two types are used.

        Returns:
            ``(score, comments)`` with the score clamped to 0-100.
        """
        comments: List[str] = []
        score = 70.0

        rhetorics = self._text_analyzer.detect_rhetoric(text)
        rhetoric_types = {r["type"] for r in rhetorics}

        if len(rhetoric_types) >= 3:
            score += 25
            comments.append(f"修辞手法运用丰富,使用了{len(rhetoric_types)}种修辞手法")
        elif len(rhetoric_types) >= 1:
            score += 15
            # De-duplicate while keeping detection order so the comment text
            # is deterministic (set iteration order is not).
            used_names = list(dict.fromkeys(r["name"] for r in rhetorics))
            comments.append(f"使用了{'、'.join(used_names)}等修辞手法")
        else:
            comments.append("建议适当使用比喻、排比等修辞手法增强表达效果")

        # Higher grades are expected to use more devices.
        if grade >= 5 and len(rhetoric_types) < 2:
            score -= 10
            comments.append("该年级建议至少使用2种以上修辞手法")

        return min(100, max(0, score)), comments

    def review_essay(self, request: EssayReviewRequest) -> Dict:
        """Run all scorers on ``request`` and assemble the result dict.

        The result contains the total score mapped onto ``request.max_score``,
        the weighted per-dimension scores, character statistics, an overall
        comment, per-dimension comments, the grammar errors (empty when
        ``enable_suggestions`` is false) and the elapsed time in milliseconds.
        """
        start_time = time.time()

        # Independent per-dimension scores (each 0-100).
        struct_score, struct_comments = self.score_structure(request.text, request.grade)
        grammar_score, grammar_errors = self.score_grammar(request.text)
        content_score, content_comments = self.score_content(
            request.text, request.title, request.genre, request.grade)
        rhetoric_score, rhetoric_comments = self.score_rhetoric(request.text, request.grade)

        # Weighted total, mapped onto the requested full mark.
        weights = self.DIMENSION_WEIGHTS
        weighted_score = (
            struct_score * weights["structure"] +
            grammar_score * weights["grammar"] +
            content_score * weights["content"] +
            rhetoric_score * weights["rhetoric"]
        )
        total_score = round(weighted_score / 100 * request.max_score, 1)

        char_stats = TextAnalyzer.count_characters(request.text)

        overall_comment = self._generate_overall_comment(
            total_score, request.max_score, struct_comments,
            content_comments, rhetoric_comments
        )

        elapsed = (time.time() - start_time) * 1000

        def scaled(raw: float, key: str) -> float:
            # Contribution of one dimension to the total, on the max_score scale.
            return round(raw / 100 * request.max_score * weights[key], 1)

        return {
            "total_score": total_score,
            "max_score": request.max_score,
            "dimensions": {
                "structure": scaled(struct_score, "structure"),
                "grammar": scaled(grammar_score, "grammar"),
                "content": scaled(content_score, "content"),
                "rhetoric": scaled(rhetoric_score, "rhetoric"),
            },
            "character_count": char_stats,
            "overall_comment": overall_comment,
            "structure_analysis": struct_comments,
            "content_analysis": content_comments,
            "rhetoric_analysis": rhetoric_comments,
            "grammar_errors": [e.dict() for e in grammar_errors] if request.enable_suggestions else [],
            "inference_time_ms": round(elapsed, 2)
        }

    def _generate_overall_comment(self, score: float, max_score: int,
                                  struct_comments: List, content_comments: List,
                                  rhetoric_comments: List) -> str:
        """Build the overall comment: a grade prefix plus leading remarks.

        The prefix is chosen from ``score / max_score`` (>= 0.9, >= 0.75,
        >= 0.6, otherwise lowest tier). It is followed by the first comment,
        if any, from the structure, content and rhetoric lists.
        """
        # Guard against a non-positive full mark instead of raising
        # ZeroDivisionError; such input falls into the lowest tier.
        ratio = score / max_score if max_score > 0 else 0.0
        if ratio >= 0.9:
            prefix = "优秀!"
        elif ratio >= 0.75:
            prefix = "良好。"
        elif ratio >= 0.6:
            prefix = "中等。"
        else:
            prefix = "需要加强。"

        suggestions = [
            group[0]
            for group in (struct_comments, content_comments, rhetoric_comments)
            if group
        ]
        return f"{prefix}{';'.join(suggestions[:3])}"
|
||
|
||
|
||
# ==================== API路由定义 ====================
|
||
|
||
# API router for this module, created with URL prefix /api/v1.
router = APIRouter(prefix="/api/v1", tags=["作文批改"])
|
||
# Single scoring engine instance, created at import time and used by the
# route handler in this module.
_scoring_engine = EssayScoringEngine()
|
||
|
||
|
||
@router.post("/essay/review")
async def review_essay(request: EssayReviewRequest):
    """Score and review an essay.

    POST /api/v1/essay/review

    Takes the essay text and scoring options, runs the module-level scoring
    engine, writes an audit log line and returns the result wrapped as
    ``{"code": 200, "msg": "success", "data": ...}``.

    Raises:
        HTTPException: with status 500 when scoring or logging fails.
    """
    try:
        result = _scoring_engine.review_essay(request)

        # Audit log.
        logger.info(
            f"作文批改完成: score={result['total_score']}/{request.max_score}, "
            f"student={request.student_id}, assignment={request.assignment_id}, "
            f"chars={result['character_count']['chinese']}, time={result['inference_time_ms']}ms"
        )
        return {"code": 200, "msg": "success", "data": result}
    except Exception as e:
        # logger.exception records the traceback along with the message, and
        # chaining keeps the original error attached to the HTTP error.
        logger.exception(f"作文批改异常: {str(e)}")
        raise HTTPException(status_code=500, detail=f"作文批改服务异常: {str(e)}") from e
|