import base64
import io

from fastapi import UploadFile

# Upload size cap in bytes; the user-facing error message below states "10MB".
_MAX_FILE_SIZE = 10 * 1024 * 1024

# Supported image extensions mapped to the MIME type reported to the caller.
_IMAGE_MIME_TYPES = {
    "png": "image/png",
    "jpg": "image/jpeg",
    "jpeg": "image/jpeg",
    "gif": "image/gif",
    "bmp": "image/bmp",
    "webp": "image/webp",
}


async def parse_file(file: UploadFile) -> dict:
    """Parse an uploaded file and extract its text or base64 image data.

    The file type is chosen from the extension of ``file.filename``
    (case-insensitive): ``txt``, ``docx`` and ``pdf`` yield text; ``png``,
    ``jpg``, ``jpeg``, ``gif``, ``bmp`` and ``webp`` yield base64 image data.

    Args:
        file: The uploaded file; its ``read`` coroutine and ``filename``
            attribute are used.

    Returns:
        A dict that always has a ``"text"`` key. Image uploads additionally
        carry ``"base64_image"`` and ``"mime_type"``. Oversized files,
        unsupported extensions and parser failures carry an ``"error"``
        message with ``"text"`` set to ``""``.
    """
    # Read one byte past the limit: enough to detect an oversized upload
    # without pulling an arbitrarily large body into memory first.
    content = await file.read(_MAX_FILE_SIZE + 1)
    if len(content) > _MAX_FILE_SIZE:
        return {"text": "", "error": "文件大小超过10MB限制"}

    filename = file.filename or ""
    _, dot, suffix = filename.rpartition(".")
    ext = suffix.lower() if dot else ""

    # NOTE(review): the docx/pdf parsers below are synchronous calls made
    # from this coroutine; consider offloading them if large documents are
    # expected.
    try:
        if ext == "txt":
            return {"text": _decode_text(content)}
        if ext == "docx":
            return {"text": _parse_docx(content)}
        if ext == "pdf":
            return {"text": _parse_pdf(content)}
        if ext in _IMAGE_MIME_TYPES:
            encoded = base64.b64encode(content).decode("utf-8")
            return {
                "text": "",
                "base64_image": encoded,
                "mime_type": _IMAGE_MIME_TYPES[ext],
            }
        return {"text": "", "error": f"不支持的文件格式: .{ext}"}
    except Exception as e:
        # Boundary handler: any parser failure is reported to the caller as
        # an error payload instead of propagating.
        return {"text": "", "error": f"文件解析失败: {e}"}


def _decode_text(content: bytes) -> str:
    """Decode raw bytes to text, trying several encodings in order.

    ``utf-8-sig`` is tried first so that a leading UTF-8 byte-order mark is
    stripped instead of leaking into the text as ``\\ufeff``; without a BOM
    it behaves exactly like ``utf-8``. If every candidate fails, the bytes
    are decoded as UTF-8 with undecodable bytes replaced.
    """
    for encoding in ("utf-8-sig", "gbk", "gb2312", "utf-16"):
        try:
            return content.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            continue
    return content.decode("utf-8", errors="replace")


def _parse_docx(content: bytes) -> str:
    """Return the non-blank paragraphs of a .docx document joined by newlines."""
    # Imported lazily so the dependency is only needed when a .docx is parsed.
    from docx import Document

    doc = Document(io.BytesIO(content))
    return "\n".join(p.text for p in doc.paragraphs if p.text.strip())


def _parse_pdf(content: bytes) -> str:
    """Return the extracted text of each PDF page joined by newlines.

    Pages for which ``extract_text()`` returns nothing are skipped.
    """
    # Imported lazily so the dependency is only needed when a PDF is parsed.
    from PyPDF2 import PdfReader

    reader = PdfReader(io.BytesIO(content))
    texts = []
    for page in reader.pages:
        text = page.extract_text()
        if text:
            texts.append(text)
    return "\n".join(texts)