CareerBot/app/services/file_parser.py
ln0422 96997daed0 Initial commit: CareerBot full-stack career showcase with AI chatbot
- FastAPI backend with SQLAlchemy ORM and SQLite
- AI chatbot with OpenAI-compatible LLM integration (SSE streaming)
- Admin panel for content management, LLM config, token management
- Anonymous access with 3-question limit, token-based access control
- Recruiter intent detection with admin notification
- Resume generator (JD-based, Markdown to Word export)
- Chinese localized public interface

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-07 20:36:38 +08:00

65 lines
1.9 KiB
Python

import base64
import io
from fastapi import UploadFile
async def parse_file(file: UploadFile) -> dict:
    """Parse an uploaded file into plain text or base64-encoded image data.

    The format is selected from the filename extension (case-insensitive):
    ``txt``, ``docx`` and ``pdf`` are converted to text; ``png``, ``jpg``,
    ``jpeg``, ``gif``, ``bmp`` and ``webp`` are returned as base64 data.

    Args:
        file: The uploaded file. Its ``read`` coroutine and ``filename``
            attribute are used.

    Returns:
        A dict that always contains a ``"text"`` key. Image uploads also
        carry ``"base64_image"`` and ``"mime_type"``. Any failure (file too
        large, unsupported extension, parser error) is reported through an
        ``"error"`` key with ``"text"`` set to an empty string; parsing
        errors are not raised to the caller.
    """
    max_bytes = 10 * 1024 * 1024
    # Read at most one byte past the limit: that is enough to detect an
    # oversized upload without buffering the whole payload in memory first.
    content = await file.read(max_bytes + 1)
    if len(content) > max_bytes:
        return {"text": "", "error": "文件大小超过10MB限制"}

    filename = file.filename or ""
    ext = filename.rsplit(".", 1)[-1].lower() if "." in filename else ""

    # Boundary handler: every parser failure is converted into an error
    # payload rather than propagated.
    try:
        if ext == "txt":
            return {"text": _decode_text(content)}
        elif ext == "docx":
            return {"text": _parse_docx(content)}
        elif ext == "pdf":
            return {"text": _parse_pdf(content)}
        elif ext in ("png", "jpg", "jpeg", "gif", "bmp", "webp"):
            b64 = base64.b64encode(content).decode("utf-8")
            # "jpg" is the one extension that differs from its MIME subtype.
            mime = f"image/{ext}" if ext != "jpg" else "image/jpeg"
            return {"text": "", "base64_image": b64, "mime_type": mime}
        else:
            return {"text": "", "error": f"不支持的文件格式: .{ext}"}
    except Exception as e:
        return {"text": "", "error": f"文件解析失败: {str(e)}"}
def _decode_text(content: bytes) -> str:
    """Decode raw text-file bytes, trying several encodings in order.

    Candidates are tried strictly in order and the first one that decodes
    without error wins. ``utf-8-sig`` is used rather than plain ``utf-8`` so
    that a leading UTF-8 byte-order mark is dropped instead of surfacing as
    a stray U+FEFF character at the start of the text; input without a BOM
    decodes exactly as it would with ``utf-8``.

    Args:
        content: Raw bytes of the file.

    Returns:
        The decoded text. If no candidate encoding fits, the bytes are
        decoded as UTF-8 with undecodable bytes replaced by U+FFFD.
    """
    for encoding in ("utf-8-sig", "gbk", "gb2312", "utf-16"):
        try:
            return content.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            continue
    # Last resort: lossy decode so the caller always gets a string back.
    return content.decode("utf-8-sig", errors="replace")
def _parse_docx(content: bytes) -> str:
    """Extract the paragraph text of a .docx document from its raw bytes.

    Only ``Document.paragraphs`` is read. Paragraphs whose text is empty or
    whitespace-only are skipped; the remaining paragraph texts are joined
    with newline characters.

    Args:
        content: Raw bytes of the .docx file.

    Returns:
        The non-blank paragraph texts joined by ``"\\n"``.
    """
    # Imported lazily so the dependency is only loaded when a .docx arrives.
    from docx import Document

    document = Document(io.BytesIO(content))
    kept = []
    for para in document.paragraphs:
        para_text = para.text
        if para_text.strip():
            kept.append(para_text)
    return "\n".join(kept)
def _parse_pdf(content: bytes) -> str:
    """Extract the text of a PDF document from its raw bytes.

    Each page is passed through ``extract_text()``; pages that yield an
    empty or falsy result are skipped and the rest are joined with newline
    characters.

    Args:
        content: Raw bytes of the PDF file.

    Returns:
        The per-page extracted text joined by ``"\\n"``.
    """
    # Imported lazily so the dependency is only loaded when a PDF arrives.
    from PyPDF2 import PdfReader

    pdf = PdfReader(io.BytesIO(content))
    page_texts = (page.extract_text() for page in pdf.pages)
    return "\n".join(chunk for chunk in page_texts if chunk)