# CareerBot/app/services/file_parser.py

import base64
import io

from fastapi import UploadFile


# Largest upload accepted by parse_file, in bytes (10 MiB).
_MAX_UPLOAD_BYTES = 10 * 1024 * 1024

# Extensions that are returned as base64 image payloads instead of text.
_IMAGE_EXTENSIONS = ("png", "jpg", "jpeg", "gif", "bmp", "webp")


async def parse_file(file: UploadFile) -> dict:
    """Parse an uploaded file into text or a base64-encoded image payload.

    The file type is chosen from the (lower-cased) filename extension only;
    the content itself is not sniffed.

    Args:
        file: The uploaded file. Its ``filename`` may be empty or ``None``.

    Returns:
        A dict that always has a ``"text"`` key, shaped as one of:

        * ``{"text": <extracted text>}`` for ``.txt``, ``.docx`` and ``.pdf``.
        * ``{"text": "", "base64_image": <str>, "mime_type": <str>}`` for
          supported image extensions.
        * ``{"text": "", "error": <message>}`` when the file exceeds the size
          limit, has an unsupported extension, or parsing raises.

    This function does not raise for parse failures; they are reported
    through the ``"error"`` key.
    """
    # Read at most one byte past the limit. That is enough to detect an
    # oversized upload without pulling an arbitrarily large body into memory.
    content = await file.read(_MAX_UPLOAD_BYTES + 1)
    if len(content) > _MAX_UPLOAD_BYTES:
        return {"text": "", "error": "文件大小超过10MB限制"}

    filename = file.filename or ""
    ext = filename.rsplit(".", 1)[-1].lower() if "." in filename else ""

    try:
        if ext == "txt":
            return {"text": _decode_text(content)}

        if ext == "docx":
            return {"text": _parse_docx(content)}

        if ext == "pdf":
            return {"text": _parse_pdf(content)}

        if ext in _IMAGE_EXTENSIONS:
            b64 = base64.b64encode(content).decode("utf-8")
            # "jpg" is not a registered MIME subtype; every other supported
            # extension is used as the subtype directly.
            mime = "image/jpeg" if ext == "jpg" else f"image/{ext}"
            return {"text": "", "base64_image": b64, "mime_type": mime}

        return {"text": "", "error": f"不支持的文件格式: .{ext}"}

    except Exception as e:
        # Boundary handler: any parser failure (corrupt file, missing
        # optional dependency, ...) is turned into an error payload.
        return {"text": "", "error": f"文件解析失败: {e}"}


def _decode_text(content: bytes) -> str:
    for encoding in ("utf-8", "gbk", "gb2312", "utf-16"):
        try:
            return content.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            continue
    return content.decode("utf-8", errors="replace")


def _parse_docx(content: bytes) -> str:
    """Extract the non-blank paragraph text of a .docx document.

    Args:
        content: Raw bytes of the .docx file.

    Returns:
        The text of every paragraph that is not empty or whitespace-only,
        joined with newlines. Only ``doc.paragraphs`` is read.
    """
    # Imported lazily so the dependency is only needed when a .docx arrives.
    from docx import Document

    document = Document(io.BytesIO(content))
    kept = []
    for paragraph in document.paragraphs:
        # Blank paragraphs are dropped; kept ones retain their original
        # (unstripped) text.
        if paragraph.text.strip():
            kept.append(paragraph.text)
    return "\n".join(kept)


def _parse_pdf(content: bytes) -> str:
    """Extract the text of every page of a PDF.

    Args:
        content: Raw bytes of the PDF file.

    Returns:
        The extracted text of each page, in page order, joined with
        newlines. Pages whose extraction yields nothing are skipped.
    """
    # Imported lazily so the dependency is only needed when a PDF arrives.
    from PyPDF2 import PdfReader

    pdf = PdfReader(io.BytesIO(content))
    page_texts = (page.extract_text() for page in pdf.pages)
    return "\n".join(chunk for chunk in page_texts if chunk)